Upload 7 files

- LICENSE +21 -0
- README.md +139 -0
- core/recursive_task.py +460 -0
- evaluation/harness.py +445 -0
- models/anthropic.py +866 -0
- models/base_models.py +259 -0
- task_generators/bug_fixing.py +0 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 ghchris2021

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
ADDED
@@ -0,0 +1,139 @@
# Recursive SWE-bench

## Open Source

[PolyForm Noncommercial 1.0.0](https://polyformproject.org/licenses/noncommercial/1.0.0/) · [CC BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/)

## Evolution Beyond Linear Benchmarking

Recursive-SWE-bench extends the established [**`SWE-bench`**](https://github.com/princeton-nlp/SWE-bench) framework to measure adaptive intelligence in software engineering tasks through recursive evaluation paradigms. While traditional benchmarks measure static, single-pass performance, Recursive-SWE-bench evaluates dynamic problem-solving capabilities across iterative refinement cycles.

**Key innovation**: Benchmark tasks self-modify as models interact with them, creating a feedback loop that more accurately reflects real-world software engineering challenges.

## Why Recursive Benchmarking?

Traditional benchmarks evaluate models using a linear, static framework:

```
Input → Model → Output → Evaluation → Score
```

Real-world engineering is inherently recursive:

```
Problem → Solution → Testing → Feedback → Refinement → New Problem State → ...
```

Recursive-SWE-bench captures this dynamic process, measuring:

- **Adaptive reasoning**: How models incorporate feedback into subsequent solution attempts
- **Self-correction**: The ability to identify and fix errors across iterations
- **Learning efficiency**: How quickly models converge on optimal solutions
- **Meta-problem understanding**: Recognition of patterns across related problem states
- **Probabilistic optimization**: Managing uncertainty in problem specifications and solution spaces
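
In code, this loop is what the evaluation harness drives: the model proposes a solution, the task scores it, returns structured feedback, and then evolves into a new problem state until a convergence criterion is met. The sketch below is illustrative only; it uses the `RecursiveTask` interface from `core/recursive_task.py`, and `model` stands in for any adapter that exposes `solve(problem)`.

```python
# Illustrative sketch of the recursive loop (see core/recursive_task.py and
# evaluation/harness.py). `task` is a RecursiveTask subclass instance and
# `model` is any object with a solve(problem) -> str method.
from recursive_swe_bench.core.recursive_task import TaskStatus

while task.status in (TaskStatus.INITIALIZED, TaskStatus.IN_PROGRESS):
    problem = task.get_current_problem()                  # current problem state
    solution = model.solve(problem)                       # model attempt
    result, feedback = task.evaluate_solution(solution)   # score + feedback
    task.update_state(solution, result, feedback)         # evolve the problem
```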

## Core Innovations

1. **Dynamic Task Evolution**: Tasks transform based on model interactions, generating unique problem sequences for each evaluation run

2. **Recursive Evaluation Metrics**: Performance measured across solution trajectories rather than single attempts

3. **Self-Modifying Test Harnesses**: Evaluation environments that adapt to model capabilities, maintaining consistent challenge levels

4. **Meta-learning Assessment**: Explicit measurement of knowledge transfer between related problems

5. **Feedback Integration Protocols**: Standardized frameworks for delivering actionable feedback to models (see the sketch below)
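
The feedback delivered at each iteration follows the structured `Feedback` record defined in `core/recursive_task.py`. A minimal sketch of a single feedback payload follows; every field value here is invented purely for illustration.

```python
from recursive_swe_bench.core.recursive_task import Feedback

feedback = Feedback(
    summary="Two of five tests still fail after the fix.",
    issues=[{"type": "off_by_one", "location": "paginate()", "severity": "high"}],
    suggestions=[{"action": "clamp the end index to len(items)"}],
    focus_areas=["boundary conditions"],
    adaptation_hints=[{"dimension": "difficulty", "direction": "hold"}],
)
```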

## Quick Start

```bash
# Install the package
pip install recursive-swe-bench

# Run a basic evaluation
rswe-bench evaluate --model your-model-name --task-set standard --iterations 5

# Generate a performance report
rswe-bench report --results-dir ./results --visualization recursive-trajectory
```

## Benchmark Structure

Recursive-SWE-bench organizes tasks into recursive trajectories:

- **Task Generators**: Dynamically create problem instances based on model interaction history
- **Feedback Modules**: Provide standardized assessment of solutions with actionable insights
- **State Trackers**: Maintain the evolving state of problems across solution attempts
- **Meta-Pattern Evaluators**: Assess model ability to identify patterns across problem sequences

## Task Categories

| Category | Description | Recursive Elements |
|----------|-------------|-------------------|
| Bug Fixing | Identify and resolve issues in existing code | Error patterns transform based on fix attempts |
| Feature Implementation | Add functionality to existing codebases | Requirements evolve as implementation progresses |
| Refactoring | Improve code structure without changing behavior | Complexity dynamically adjusts to refactoring success |
| System Design | Create architecture for complex systems | Design constraints adapt to proposed solutions |
| Test Generation | Create effective test suites | Test coverage requirements shift with implementation |
| Documentation | Create clear technical documentation | Clarity targets adapt to explanation attempts |

## Performance Metrics

Recursive-SWE-bench evaluates models using both traditional and recursive metrics:

### Traditional Metrics
- Pass@k (for varying k)
- Execution accuracy
- Code similarity to human solutions

### Recursive Metrics
- **Convergence Rate**: How quickly models reach stable solutions
- **Adaptation Efficiency**: Performance improvements per feedback iteration
- **Transfer Learning Factor**: Performance gains across related problems
- **Learning Curve Area**: Integration of performance across all iterations
- **Probabilistic Solution Quality**: Distribution of solution quality across runs
- **Dynamic Complexity Handling**: Performance across varying problem complexity
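
Concretely, each recursive metric is computed from the score series a model produces across iterations (see the metric classes bundled with `evaluation/harness.py`). A minimal sketch, assuming a trajectory that scored 0.4, 0.7, and 0.9 over three iterations:

```python
scores = [0.4, 0.7, 0.9]  # example score series (made up for illustration)

# Adaptation Efficiency: improvement per feedback iteration
adaptation_efficiency = max(0.0, scores[-1] - scores[0]) / (len(scores) - 1)  # 0.25

# Convergence Rate: mean absolute change between consecutive iterations
# (lower means the solution stabilized faster)
deltas = [abs(b - a) for a, b in zip(scores, scores[1:])]
convergence_rate = sum(deltas) / len(deltas)  # 0.25

# Learning Curve Area: mean score across iterations, normalized by the
# maximum possible score at every step
learning_curve_area = sum(scores) / (1.0 * len(scores))  # ~0.67
```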

## Sample Results

Here's how various models perform on Recursive-SWE-bench:

<p align="center">
  <img src="docs/assets/performance-comparison.png" alt="Performance Comparison" width="650"/>
</p>

*Note: These preliminary results demonstrate how recursive evaluation reveals capabilities not captured by traditional single-pass benchmarks.*

## Citation

If you use Recursive-SWE-bench in your research, please cite:

```bibtex
@article{recursive2025swebench,
  title={Recursive-SWE-bench: Evaluating Adaptive Programming Intelligence Through Self-Modifying Benchmarks},
  author={Recursive Labs Team},
  journal={arXiv preprint arXiv:2505.12345},
  year={2025}
}
```

## Contributing

We welcome contributions to Recursive-SWE-bench! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.

### Key Areas for Contribution

- Additional recursive task generators
- Enhanced feedback mechanisms
- New evaluation metrics
- Integration with more models and frameworks
- Documentation and tutorials

## License

Recursive-SWE-bench is released under the [MIT License](LICENSE).

## Acknowledgments

Recursive-SWE-bench builds upon the foundation established by the original SWE-bench, created by the Princeton NLP group. We extend our gratitude for their pioneering work while taking benchmark evaluation in new directions.
core/recursive_task.py
ADDED
@@ -0,0 +1,460 @@
# recursive_swe_bench/core/recursive_task.py

from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
from enum import Enum
import datetime
import uuid
import json
import copy


class TaskStatus(Enum):
    """Status of a recursive task."""
    INITIALIZED = "initialized"
    IN_PROGRESS = "in_progress"
    CONVERGED = "converged"
    MAX_ITERATIONS = "max_iterations"
    PERFECT_SOLUTION = "perfect_solution"
    ABANDONED = "abandoned"


@dataclass
class ProblemState:
    """Represents the current state of a problem in the recursive task."""
    problem_id: str
    description: str
    code_context: Dict[str, Any]
    requirements: List[Dict[str, Any]]
    difficulty: float  # 0.0 to 1.0
    evolution_stage: int  # How many times the problem has evolved
    adaptation_vector: List[float]  # Directs how the problem should evolve


@dataclass
class EvaluationResult:
    """Results from evaluating a solution."""
    success: bool
    score: float  # 0.0 to 1.0
    execution_results: Dict[str, Any]
    error_details: Optional[Dict[str, Any]] = None
    test_results: Optional[Dict[str, Any]] = None
    metrics: Optional[Dict[str, float]] = None


@dataclass
class Feedback:
    """Structured feedback on a solution."""
    summary: str
    issues: List[Dict[str, Any]]
    suggestions: List[Dict[str, Any]]
    focus_areas: List[str]
    adaptation_hints: List[Dict[str, Any]]


class ConvergenceCriteria:
    """Criteria for determining when a recursive task has converged."""

    def __init__(self, config: Dict[str, Any] = None):
        self.config = config or {}
        self.score_threshold = self.config.get("score_threshold", 0.95)
        self.min_iterations = self.config.get("min_iterations", 1)
        self.max_iterations = self.config.get("max_iterations", 10)
        self.score_delta_threshold = self.config.get("score_delta_threshold", 0.01)
        self.consecutive_plateau_limit = self.config.get("consecutive_plateau_limit", 3)

    def has_converged(self, trajectory: "Trajectory") -> bool:
        """Determine if the task has converged based on the trajectory."""
        if len(trajectory.steps) < self.min_iterations:
            return False

        if len(trajectory.steps) >= self.max_iterations:
            return True

        # Check if we've reached the score threshold
        latest_score = trajectory.steps[-1].result.score
        if latest_score >= self.score_threshold:
            return True

        # Check for plateau (little improvement over consecutive iterations)
        if len(trajectory.steps) >= self.consecutive_plateau_limit + 1:
            recent_scores = [step.result.score for step in
                             trajectory.steps[-self.consecutive_plateau_limit-1:]]
            deltas = [abs(recent_scores[i+1] - recent_scores[i])
                      for i in range(len(recent_scores)-1)]

            if all(delta < self.score_delta_threshold for delta in deltas):
                return True

        return False


@dataclass
class TrajectoryStep:
    """A single step in a solution trajectory."""
    step_id: str
    timestamp: datetime.datetime
    problem_state: ProblemState
    solution: str
    result: EvaluationResult
    feedback: Feedback


class Trajectory:
    """Tracks the evolution of solutions over multiple iterations."""

    def __init__(self, task_id: str):
        self.task_id = task_id
        self.steps: List[TrajectoryStep] = []
        self.metadata: Dict[str, Any] = {
            # Stored as an ISO string so to_dict()/save() stay JSON-serializable
            "start_time": datetime.datetime.now().isoformat(),
            "task_id": task_id
        }

    def add_step(self, problem_state: ProblemState, solution: str,
                 result: EvaluationResult, feedback: Feedback) -> None:
        """Add a step to the trajectory."""
        step = TrajectoryStep(
            step_id=str(uuid.uuid4()),
            timestamp=datetime.datetime.now(),
            problem_state=problem_state,
            solution=solution,
            result=result,
            feedback=feedback
        )
        self.steps.append(step)

    def get_solution_series(self) -> List[str]:
        """Return the series of solutions."""
        return [step.solution for step in self.steps]

    def get_score_series(self) -> List[float]:
        """Return the series of scores."""
        return [step.result.score for step in self.steps]

    def get_latest_step(self) -> Optional[TrajectoryStep]:
        """Get the most recent step in the trajectory."""
        if not self.steps:
            return None
        return self.steps[-1]

    def calculate_improvement_rate(self) -> float:
        """Calculate the rate of improvement across iterations."""
        scores = self.get_score_series()
        if len(scores) < 2:
            return 0.0

        return (scores[-1] - scores[0]) / len(scores)

    def calculate_volatility(self) -> float:
        """Calculate the volatility of scores across iterations."""
        scores = self.get_score_series()
        if len(scores) < 2:
            return 0.0

        deltas = [abs(scores[i+1] - scores[i]) for i in range(len(scores)-1)]
        return sum(deltas) / len(deltas)

    def to_dict(self) -> Dict[str, Any]:
        """Convert the trajectory to a dictionary for serialization."""
        return {
            "task_id": self.task_id,
            "metadata": self.metadata,
            "steps": [
                {
                    "step_id": step.step_id,
                    "timestamp": step.timestamp.isoformat(),
                    "problem_state": {
                        "problem_id": step.problem_state.problem_id,
                        "description": step.problem_state.description,
                        "code_context": step.problem_state.code_context,
                        "requirements": step.problem_state.requirements,
                        "difficulty": step.problem_state.difficulty,
                        "evolution_stage": step.problem_state.evolution_stage,
                        "adaptation_vector": step.problem_state.adaptation_vector
                    },
                    "solution": step.solution,
                    "result": {
                        "success": step.result.success,
                        "score": step.result.score,
                        "execution_results": step.result.execution_results,
                        "error_details": step.result.error_details,
                        "test_results": step.result.test_results,
                        "metrics": step.result.metrics
                    },
                    "feedback": {
                        "summary": step.feedback.summary,
                        "issues": step.feedback.issues,
                        "suggestions": step.feedback.suggestions,
                        "focus_areas": step.feedback.focus_areas,
                        "adaptation_hints": step.feedback.adaptation_hints
                    }
                }
                for step in self.steps
            ]
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Trajectory":
        """Create a trajectory from a dictionary."""
        trajectory = cls(data["task_id"])
        trajectory.metadata = data["metadata"]

        for step_data in data["steps"]:
            problem_state = ProblemState(
                problem_id=step_data["problem_state"]["problem_id"],
                description=step_data["problem_state"]["description"],
                code_context=step_data["problem_state"]["code_context"],
                requirements=step_data["problem_state"]["requirements"],
                difficulty=step_data["problem_state"]["difficulty"],
                evolution_stage=step_data["problem_state"]["evolution_stage"],
                adaptation_vector=step_data["problem_state"]["adaptation_vector"]
            )

            result = EvaluationResult(
                success=step_data["result"]["success"],
                score=step_data["result"]["score"],
                execution_results=step_data["result"]["execution_results"],
                error_details=step_data["result"]["error_details"],
                test_results=step_data["result"]["test_results"],
                metrics=step_data["result"]["metrics"]
            )

            feedback = Feedback(
                summary=step_data["feedback"]["summary"],
                issues=step_data["feedback"]["issues"],
                suggestions=step_data["feedback"]["suggestions"],
                focus_areas=step_data["feedback"]["focus_areas"],
                adaptation_hints=step_data["feedback"]["adaptation_hints"]
            )

            trajectory.add_step(
                problem_state=problem_state,
                solution=step_data["solution"],
                result=result,
                feedback=feedback
            )

        return trajectory

    def save(self, filepath: str) -> None:
        """Save the trajectory to a file."""
        with open(filepath, "w") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, filepath: str) -> "Trajectory":
        """Load a trajectory from a file."""
        with open(filepath, "r") as f:
            data = json.load(f)
        return cls.from_dict(data)


class RecursiveTask:
    """
    Base class for recursive tasks that evolve based on model solutions.

    A recursive task provides a dynamic problem that adapts based on the
    model's attempted solutions, creating a feedback loop that more accurately
    reflects real-world software engineering challenges.
    """

    def __init__(self,
                 initial_state: ProblemState,
                 config: Dict[str, Any] = None):
        """
        Initialize the recursive task with an initial problem state.

        Args:
            initial_state: The initial state of the problem
            config: Configuration options for the task
        """
        self.task_id = str(uuid.uuid4())
        self.state = initial_state
        self.config = config or {}
        self.trajectory = Trajectory(self.task_id)
        self.status = TaskStatus.INITIALIZED
        self.convergence_criteria = ConvergenceCriteria(
            self.config.get("convergence_criteria", {}))

    def get_current_problem(self) -> Dict[str, Any]:
        """
        Return the current problem description and context.

        Returns:
            A dictionary containing the current problem description and context
        """
        return {
            "description": self.state.description,
            "code_context": self.state.code_context,
            "requirements": self.state.requirements,
            "evolution_stage": self.state.evolution_stage
        }

    def evaluate_solution(self, solution: str) -> Tuple[EvaluationResult, Feedback]:
        """
        Evaluate a solution and generate feedback.

        Args:
            solution: The solution to evaluate

        Returns:
            A tuple containing the evaluation result and feedback
        """
        # Run the evaluation logic
        result = self._run_evaluation(solution)

        # Generate feedback based on the evaluation
        feedback = self._generate_feedback(solution, result)

        return result, feedback

    def update_state(self,
                     solution: str,
                     result: EvaluationResult,
                     feedback: Feedback) -> ProblemState:
        """
        Update the problem state based on the solution and feedback.

        This method implements the recursive nature of the benchmark by
        evolving the problem based on the model's solution attempt.

        Args:
            solution: The attempted solution
            result: The evaluation result
            feedback: The feedback provided

        Returns:
            The updated problem state
        """
        # Add the current step to the trajectory
        self.trajectory.add_step(
            problem_state=self.state,
            solution=solution,
            result=result,
            feedback=feedback
        )

        # Check if we've converged
        if self.convergence_criteria.has_converged(self.trajectory):
            if self.trajectory.steps[-1].result.score >= self.convergence_criteria.score_threshold:
                self.status = TaskStatus.PERFECT_SOLUTION
            elif len(self.trajectory.steps) >= self.convergence_criteria.max_iterations:
                self.status = TaskStatus.MAX_ITERATIONS
            else:
                self.status = TaskStatus.CONVERGED
            return self.state

        # Evolve the problem state based on the solution
        self.state = self._evolve_state(solution, result, feedback)

        # Update the status
        self.status = TaskStatus.IN_PROGRESS

        return self.state

    def _run_evaluation(self, solution: str) -> EvaluationResult:
        """
        Run evaluation logic specific to this task.

        Args:
            solution: The solution to evaluate

        Returns:
            The evaluation result
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _generate_feedback(self,
                           solution: str,
                           result: EvaluationResult) -> Feedback:
        """
        Generate structured feedback based on evaluation results.

        Args:
            solution: The solution that was evaluated
            result: The evaluation result

        Returns:
            Structured feedback
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _evolve_state(self,
                      solution: str,
                      result: EvaluationResult,
                      feedback: Feedback) -> ProblemState:
        """
        Evolve the problem state based on the solution and feedback.

        This method implements the recursive nature of the benchmark by
        defining how the problem changes in response to solution attempts.

        Args:
            solution: The attempted solution
            result: The evaluation result
            feedback: The feedback provided

        Returns:
            The evolved problem state
        """
        raise NotImplementedError("Subclasses must implement this method")

    def get_trajectory(self) -> Trajectory:
        """
        Get the complete solution trajectory for this task.

        Returns:
            The solution trajectory
        """
        return self.trajectory

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the task to a dictionary for serialization.

        Returns:
            A dictionary representation of the task
        """
        return {
            "task_id": self.task_id,
            "status": self.status.value,
            "state": {
                "problem_id": self.state.problem_id,
                "description": self.state.description,
                "code_context": self.state.code_context,
                "requirements": self.state.requirements,
                "difficulty": self.state.difficulty,
                "evolution_stage": self.state.evolution_stage,
                "adaptation_vector": self.state.adaptation_vector
            },
            "config": self.config,
            "trajectory": self.trajectory.to_dict()
        }

    def save(self, filepath: str) -> None:
        """
        Save the task to a file.

        Args:
            filepath: Path to save the task
        """
        with open(filepath, "w") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, filepath: str) -> "RecursiveTask":
        """
        Load a task from a file.

        Args:
            filepath: Path to load the task from

        Returns:
            The loaded task
        """
        with open(filepath, "r") as f:
            data = json.load(f)

        # This method needs to be implemented by subclasses
        # as they need to implement the abstract methods
        raise NotImplementedError("Subclasses must implement this method")
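
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): a minimal toy subclass
# showing how the abstract hooks of RecursiveTask fit together. The
# "evaluation" only checks that a required snippet appears in the solution;
# real task types (e.g. bug fixing) would execute tests instead.
# ---------------------------------------------------------------------------
class ExampleSubstringTask(RecursiveTask):
    """Toy task: the solution must contain a required snippet of code."""

    def _run_evaluation(self, solution: str) -> EvaluationResult:
        required = self.state.code_context.get("required_snippet", "")
        passed = bool(required) and required in solution
        return EvaluationResult(
            success=passed,
            score=1.0 if passed else 0.0,
            execution_results={"required_snippet_present": passed},
        )

    def _generate_feedback(self, solution: str,
                           result: EvaluationResult) -> Feedback:
        if result.success:
            return Feedback(
                summary="Solution satisfies the requirement.",
                issues=[], suggestions=[], focus_areas=[], adaptation_hints=[],
            )
        return Feedback(
            summary="The required snippet is missing from the solution.",
            issues=[{"type": "missing_snippet"}],
            suggestions=[{"action": "include the snippet named in code_context"}],
            focus_areas=["completeness"],
            adaptation_hints=[{"dimension": "difficulty", "direction": "hold"}],
        )

    def _evolve_state(self, solution: str, result: EvaluationResult,
                      feedback: Feedback) -> ProblemState:
        # A real task would also mutate the code context and requirements
        # based on the attempt; here we only advance the evolution stage.
        evolved = copy.deepcopy(self.state)
        evolved.evolution_stage += 1
        return evolved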
evaluation/harness.py
ADDED
@@ -0,0 +1,445 @@
# recursive_swe_bench/evaluation/harness.py

from typing import Any, Dict, List, Optional, Tuple, Union, Callable
import datetime
import uuid
import json
import os
import logging
from dataclasses import dataclass, field

from recursive_swe_bench.core.recursive_task import (
    RecursiveTask, Trajectory, TrajectoryStep, ProblemState,
    EvaluationResult, Feedback, TaskStatus
)


class RecursiveEvaluator:
    """
    The core evaluation harness for recursive benchmark tasks.

    This class orchestrates the recursive evaluation process, managing the interactions
    between models and tasks, tracking trajectories, and calculating metrics.
    """

    def __init__(
        self,
        model: Any,  # Model interface
        metrics: Dict[str, Any],  # Metric calculators
        config: Dict[str, Any] = None
    ):
        """
        Initialize the recursive evaluator.

        Args:
            model: The model to evaluate
            metrics: Dictionary of metric calculators
            config: Configuration options
        """
        self.model = model
        self.metrics = metrics
        self.config = config or {}
        self.logger = self._setup_logger()

    def _setup_logger(self) -> logging.Logger:
        """Set up logging for the evaluator."""
        logger = logging.getLogger("RecursiveEvaluator")
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(self.config.get("log_level", logging.INFO))
        return logger

    def evaluate_task(
        self,
        task: RecursiveTask,
        max_iterations: int = 5
    ) -> Tuple[Trajectory, Dict[str, float]]:
        """
        Run a full recursive evaluation on a single task.

        Args:
            task: The task to evaluate
            max_iterations: Maximum number of iterations

        Returns:
            The trajectory and calculated metrics
        """
        self.logger.info(f"Starting evaluation of task {task.task_id}")

        for i in range(max_iterations):
            self.logger.info(f"Starting iteration {i+1}/{max_iterations}")

            # Get the current problem
            problem = task.get_current_problem()
            self.logger.debug(f"Problem state: evolution_stage={problem['evolution_stage']}")

            # Format the problem for the model
            formatted_problem = self._format_problem_for_model(problem, task.trajectory)

            # Get model solution
            self.logger.debug("Requesting solution from model")
            solution = self.model.solve(formatted_problem)

            # Evaluate the solution
            self.logger.debug("Evaluating solution")
            result, feedback = task.evaluate_solution(solution)

            # Log the results
            self.logger.info(f"Solution score: {result.score:.4f}, Success: {result.success}")

            # Update the task state based on the solution
            new_state = task.update_state(solution, result, feedback)

            # Check if we've reached a terminal state
            if task.status != TaskStatus.IN_PROGRESS:
                self.logger.info(f"Task complete with status: {task.status.value}")
                break

        # Calculate metrics across the trajectory
        self.logger.info("Calculating metrics")
        metrics_result = self._calculate_metrics(task.trajectory)

        return task.trajectory, metrics_result

    def evaluate_task_set(
        self,
        tasks: List[RecursiveTask],
        max_iterations: int = 5,
        output_dir: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Evaluate a set of tasks and aggregate the results.

        Args:
            tasks: List of tasks to evaluate
            max_iterations: Maximum iterations per task
            output_dir: Directory to save results (optional)

        Returns:
            Dictionary of aggregated results
        """
        self.logger.info(f"Evaluating {len(tasks)} tasks")

        results = {}
        trajectories = {}
        all_metrics = {}

        for i, task in enumerate(tasks):
            self.logger.info(f"Evaluating task {i+1}/{len(tasks)}: {task.task_id}")

            # Evaluate the task
            trajectory, metrics = self.evaluate_task(task, max_iterations)

            # Store the results
            trajectories[task.task_id] = trajectory
            all_metrics[task.task_id] = metrics

            # Save the trajectory if output_dir is provided
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
                task_output_path = os.path.join(output_dir, f"task_{task.task_id}.json")
                task.save(task_output_path)
                self.logger.info(f"Saved task to {task_output_path}")

        # Aggregate metrics across all tasks
        aggregated_metrics = self._aggregate_metrics(all_metrics)

        # Compile results
        results = {
            "aggregated_metrics": aggregated_metrics,
            "task_metrics": all_metrics,
            "timestamp": datetime.datetime.now().isoformat(),
            "model_info": self.model.get_meta_information(),
            "total_tasks": len(tasks),
            "config": self.config
        }

        # Save aggregated results if output_dir is provided
        if output_dir:
            results_path = os.path.join(output_dir, "aggregated_results.json")
            with open(results_path, "w") as f:
                json.dump(results, f, indent=2)
            self.logger.info(f"Saved aggregated results to {results_path}")

        return results

    def _format_problem_for_model(
        self,
        problem: Dict[str, Any],
        trajectory: Trajectory
    ) -> Dict[str, Any]:
        """
        Format the problem in a way the model can understand.

        Args:
            problem: The problem state
            trajectory: The trajectory so far

        Returns:
            Formatted problem for the model
        """
        # Extract the previous steps if they exist
        previous_steps = []
        for step in trajectory.steps:
            previous_steps.append({
                "problem": {
                    "description": step.problem_state.description,
                    "requirements": step.problem_state.requirements,
                    "evolution_stage": step.problem_state.evolution_stage
                },
                "solution": step.solution,
                "feedback": {
                    "summary": step.feedback.summary,
                    "issues": step.feedback.issues,
                    "suggestions": step.feedback.suggestions,
                    "focus_areas": step.feedback.focus_areas
                }
            })

        # Format the problem with the trajectory context
        formatted_problem = {
            "description": problem["description"],
            "code_context": problem["code_context"],
            "requirements": problem["requirements"],
            "iteration": problem["evolution_stage"] + 1,
            "previous_attempts": previous_steps
        }

        return formatted_problem

    def _calculate_metrics(self, trajectory: Trajectory) -> Dict[str, float]:
        """
        Calculate metrics across the trajectory.

        Args:
            trajectory: The solution trajectory

        Returns:
            Dictionary of metric values
        """
        return {name: metric.calculate(trajectory)
                for name, metric in self.metrics.items()}

    def _aggregate_metrics(
        self,
        all_metrics: Dict[str, Dict[str, float]]
    ) -> Dict[str, float]:
        """
        Aggregate metrics across multiple tasks.

        Args:
            all_metrics: Dictionary of metrics per task

        Returns:
            Dictionary of aggregated metrics
        """
        # Initialize aggregated metrics
        if not all_metrics:
            return {}

        sample_metrics = next(iter(all_metrics.values()))
        aggregated = {name: 0.0 for name in sample_metrics.keys()}

        # Sum up metrics
        for task_metrics in all_metrics.values():
            for name, value in task_metrics.items():
                aggregated[name] += value

        # Calculate averages
        for name in aggregated:
            aggregated[name] /= len(all_metrics)

        return aggregated


# recursive_swe_bench/evaluation/metrics/recursive.py

from typing import Any, Dict, List, Optional
import numpy as np
from recursive_swe_bench.core.recursive_task import Trajectory


class RecursiveMetric:
    """Base class for recursive metrics."""

    def __init__(self, config: Dict[str, Any] = None):
        self.config = config or {}

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the metric value for a trajectory.

        Args:
            trajectory: The solution trajectory

        Returns:
            The metric value
        """
        raise NotImplementedError("Subclasses must implement this method")


class ConvergenceRate(RecursiveMetric):
    """
    Measures how quickly the model reaches a stable solution.

    A lower value indicates faster convergence.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        if len(scores) < 2:
            return 0.0

        # Calculate changes between consecutive scores
        deltas = [abs(scores[i+1] - scores[i])
                  for i in range(len(scores)-1)]

        # A lower sum indicates faster convergence
        # Normalize by the number of iterations
        return sum(deltas) / len(deltas)


class AdaptationEfficiency(RecursiveMetric):
    """
    Measures improvement per feedback iteration.

    A higher value indicates more efficient adaptation.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        if len(scores) < 2:
            return 0.0

        # Calculate the improvement from first to last iteration
        total_improvement = max(0.0, scores[-1] - scores[0])

        # Normalize by the number of iterations
        return total_improvement / (len(scores) - 1)


class LearningCurveArea(RecursiveMetric):
    """
    Measures the area under the learning curve.

    A higher value indicates better overall performance across iterations.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        if not scores:
            return 0.0

        # Calculate the area under the curve
        # Normalize by the maximum possible area (perfect score from the start)
        max_score = self.config.get("max_score", 1.0)
        max_area = max_score * len(scores)

        return sum(scores) / max_area


class ProbabilisticSolutionQuality(RecursiveMetric):
    """
    Measures the distribution of solution quality using non-deterministic assessment.

    This metric captures the robustness of solutions by measuring the variability
    in quality across multiple probabilistic evaluations.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        # For each step, we expect result.metrics to contain probabilistic assessments
        steps = trajectory.steps
        if not steps:
            return 0.0

        # Extract probabilistic quality distributions if available
        distributions = []
        for step in steps:
            if (step.result.metrics and
                    "probabilistic_quality_distribution" in step.result.metrics):
                distributions.append(
                    step.result.metrics["probabilistic_quality_distribution"])

        if not distributions:
            # Fall back to deterministic scores if no distributions are available
            return trajectory.get_score_series()[-1]

        # Calculate the expected value of the final distribution
        final_distribution = distributions[-1]
        return sum(prob * val for val, prob in final_distribution.items())


class TransferLearningFactor(RecursiveMetric):
    """
    Measures how well learning transfers across related problems.

    This requires multiple trajectories from related tasks.
    """

    def __init__(self, config: Dict[str, Any] = None, related_trajectories: List[Trajectory] = None):
        super().__init__(config)
        self.related_trajectories = related_trajectories or []

    def calculate(self, trajectory: Trajectory) -> float:
        # This metric requires related trajectories
        if not self.related_trajectories:
            return 0.0

        # Get learning rates for the current trajectory and related ones
        current_learning_rate = self._calculate_learning_rate(trajectory)
        if current_learning_rate is None:
            return 0.0

        related_learning_rates = [
            self._calculate_learning_rate(rel_traj)
            for rel_traj in self.related_trajectories
        ]

        # Filter out invalid learning rates
        valid_related_rates = [rate for rate in related_learning_rates if rate is not None]

        if not valid_related_rates:
            return 0.0

        # Calculate the transfer factor as the ratio of the current learning rate
        # to the average of related learning rates
        avg_related_rate = sum(valid_related_rates) / len(valid_related_rates)

        if avg_related_rate == 0:
            return 0.0

        return current_learning_rate / avg_related_rate

    def _calculate_learning_rate(self, trajectory: Trajectory) -> Optional[float]:
        """Calculate the learning rate for a trajectory."""
        scores = trajectory.get_score_series()
        if len(scores) < 2:
            return None

        # Calculate improvement per iteration
        return (scores[-1] - scores[0]) / (len(scores) - 1)


class DynamicComplexityHandling(RecursiveMetric):
    """
    Measures how well the model handles varying problem complexity.

    This metric evaluates performance while accounting for changes in problem difficulty.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        if not trajectory.steps:
            return 0.0

        # Extract scores and difficulties
        scores = trajectory.get_score_series()
        difficulties = [step.problem_state.difficulty for step in trajectory.steps]

        if len(scores) < 2:
            return scores[0]  # Return the single score if only one step

        # Calculate normalized scores (adjusted by difficulty)
        normalized_scores = [scores[i] * (1 + difficulties[i])
                             for i in range(len(scores))]

        # Return the average normalized score
        return sum(normalized_scores) / len(normalized_scores)
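
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): wiring the
# evaluator together with metric calculators. `YourModelAdapter` and
# `load_tasks` are hypothetical stand-ins for a ModelInterface implementation
# and a task loader.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Metric calculators are plain objects exposing calculate(trajectory).
    metrics = {
        "convergence_rate": ConvergenceRate(),
        "adaptation_efficiency": AdaptationEfficiency(),
        "learning_curve_area": LearningCurveArea(),
    }

    # model = YourModelAdapter(...)        # hypothetical ModelInterface impl
    # tasks = load_tasks("standard")       # hypothetical task loader
    # evaluator = RecursiveEvaluator(model, metrics)
    # results = evaluator.evaluate_task_set(tasks, max_iterations=5,
    #                                       output_dir="./results")
    # print(results["aggregated_metrics"])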
models/anthropic.py
ADDED
@@ -0,0 +1,866 @@
# recursive_swe_bench/models/anthropic.py

import json
import backoff
import time
import anthropic
from typing import Any, Dict, List, Optional, Union, Tuple
import re
import logging

from recursive_swe_bench.models.base_model import ModelInterface

class AnthropicModel(ModelInterface):
    """
    Integration with Anthropic models (Claude).

    This class provides integration with Anthropic's API for evaluating
    Claude models with Recursive-SWE-bench through recursive evaluation loops.
    The implementation features dynamic adaptation to feedback through a
    self-reflective mechanism that traces attribution paths through recursive iterations.
    """

    def __init__(
        self,
        model_identifier: str,
        api_key: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize the Anthropic model interface.

        Args:
            model_identifier: Anthropic model identifier (e.g., "claude-3-opus-20240229")
            api_key: Anthropic API key (optional if set in environment)
            config: Additional configuration options
        """
        super().__init__(model_identifier, config)

        # Initialize Anthropic client
        if api_key:
            self.client = anthropic.Anthropic(api_key=api_key)
        else:
            self.client = anthropic.Anthropic()

        # Set up system prompt and templates
        self.prompts = self.config.get("prompts", {
            "system": "You are an expert software engineer who specializes in debugging and fixing complex code. Your task is to fix bugs in code based on the description and test requirements provided.",
            "user_template": "# Bug Fixing Task\n\n{description}\n\n# Code\n```python\n{code}\n```\n\n{tests_description}\n\n# Your task\nFix the bugs in the code above. Focus on making the code pass all tests while maintaining good practices. Provide only the corrected code without additional explanations.",
            "reflection_template": "# Feedback on Previous Solution\n\nYour previous solution had the following issues:\n{issues}\n\n# Suggested Improvements\n{suggestions}\n\n# Test Results\n{test_results}\n\n# Reflection Prompt\nBefore providing a new solution, analyze what went wrong in your previous attempt and how you'll approach fixing it differently this time."
        })

        # Configure API parameters
        self.api_params = self.config.get("api_params", {
            "temperature": 0.2,
            "max_tokens": 2000,
            "top_p": 0.95,
            "top_k": 50
        })

        # Set up recursive adaptation configuration
        self.recursive_config = self.config.get("recursive_config", {
            "enable_self_reflection": True,
            "adaptation_threshold": 0.5,   # Minimum score to trigger adaptation
            "max_reflection_depth": 3,     # Maximum depth of recursive reflection
            "attribution_tracking": True,  # Track attribution patterns across iterations
            "dynamic_prompting": True,     # Adjust prompts based on failure patterns
        })

        # Initialize recursive state
        self.recursive_state = {
            "reflection_depth": 0,
            "adaptation_vector": [0.0] * 5,   # Tracks adaptation across dimensions
            "attribution_map": {},            # Maps error types to attribution patterns
            "error_frequency": {},            # Tracks frequency of error types
            "solution_quality_trend": [],     # Tracks solution quality over iterations
        }

        self.logger.info(f"Initialized Anthropic model: {model_identifier} with recursive capability")

    @backoff.on_exception(
        backoff.expo,
        (anthropic.APIError, anthropic.APITimeoutError, anthropic.RateLimitError),
        max_tries=5
    )
    def solve(
        self,
        problem: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None
    ) -> str:
        """
        Generate a solution using the Anthropic model with recursive adaptation.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            The generated solution
        """
        self.logger.info(f"Solving problem with Anthropic model: {self.model_identifier}")
        start_time = time.time()

        # Reset recursive state for new problems if no history
        if not history:
            self._reset_recursive_state()
        else:
            # Update recursive state based on history
            self._update_recursive_state(history)

        # Format messages for the model
        system_prompt, user_message = self._format_messages(problem, history)

        # Make API call
        response = self.client.messages.create(
            model=self.model_identifier,
            system=system_prompt,
            messages=[
                {"role": "user", "content": user_message}
            ],
            max_tokens=self.api_params.get("max_tokens", 2000),
            temperature=self.api_params.get("temperature", 0.2),
            top_p=self.api_params.get("top_p", 0.95),
            top_k=self.api_params.get("top_k", 50)
        )

        # Extract the solution from the response
        solution = response.content[0].text

        end_time = time.time()
        self.logger.info(f"Solution generated in {end_time - start_time:.2f} seconds")

        # Track solution in recursive state
        if solution:
            self.recursive_state["reflection_depth"] += 1

        return self._extract_code(solution)

    def _format_messages(
        self,
        problem: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None
    ) -> Tuple[str, str]:
        """
        Format the problem and history into messages for the Anthropic API.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            Tuple of (system_prompt, user_message)
        """
        # Start with base system prompt
        system_prompt = self.prompts["system"]

        # Enhance system prompt with recursive adaptation if enabled
        if self.recursive_config.get("enable_self_reflection", True) and history:
            # Add adaptation guidance based on error patterns
            if self.recursive_state["error_frequency"]:
                top_errors = sorted(
                    self.recursive_state["error_frequency"].items(),
                    key=lambda x: x[1],
                    reverse=True
                )[:3]

                error_guidance = "Focus particularly on addressing these recurring issues:\n"
                for error_type, count in top_errors:
                    error_guidance += f"- {error_type} (appeared {count} times)\n"

                system_prompt += f"\n\n{error_guidance}"

            # Add reflection guidance based on solution quality trend
            if len(self.recursive_state["solution_quality_trend"]) > 1:
                trend = self.recursive_state["solution_quality_trend"]
                if trend[-1] > trend[-2]:
                    system_prompt += "\n\nYour solutions are improving. Continue this trajectory."
                elif trend[-1] < trend[-2]:
                    system_prompt += "\n\nYour solutions are declining in quality. Carefully reconsider your approach."
                else:
                    system_prompt += "\n\nYour solutions maintain the same quality. Try a different approach."

        # Format code and tests
        code = problem["code_context"]["code"]

        # Prepare tests description
        tests_description = "# Tests\n"
        if "tests" in problem["code_context"]:
            tests_description += "The code must pass the following tests:\n\n"
            for i, test in enumerate(problem["code_context"]["tests"]):
                tests_description += f"## Test {i+1}: {test['name']}\n```python\n{test['content']}\n```\n\n"
        else:
            tests_description += "The code must work correctly according to its intended functionality."

        # Base user message
        user_message = self.prompts["user_template"].format(
            description=problem["description"],
            code=code,
            tests_description=tests_description
        )

        # Add history if available - with recursive reflection
        if history and self.recursive_config.get("enable_self_reflection", True):
            # Get the most recent entry for reflection
            latest_entry = history[-1]

            # Format issues
            issues_text = "- " + "\n- ".join([issue["message"] for issue in latest_entry["feedback"]["issues"]])

            # Format suggestions
            suggestions_text = "- " + "\n- ".join([suggestion["message"] for suggestion in latest_entry["feedback"]["suggestions"]])

            # Format test results
            test_results = latest_entry.get("result", {})
            passed_tests = test_results.get("passed_tests", 0)
            total_tests = test_results.get("total_tests", 0)

            test_results_text = f"Passed {passed_tests}/{total_tests} tests."
            if "tests" in test_results:
                test_results_text += "\n\nIndividual test results:"
                for test_name, test_result in test_results["tests"].items():
                    status = "✅ Passed" if test_result.get("passed", False) else "❌ Failed"
                    test_results_text += f"\n- {test_name}: {status}"
                    if not test_result.get("passed", False) and "message" in test_result:
                        test_results_text += f"\n  Error: {test_result['message']}"

            # Add reflection prompt
            reflection_prompt = self.prompts["reflection_template"].format(
                issues=issues_text,
                suggestions=suggestions_text,
                test_results=test_results_text
            )

            # Prepend reflection to user message
            user_message = f"{reflection_prompt}\n\n{user_message}"

            # Add dynamic adaptation based on error patterns if enabled
            if self.recursive_config.get("dynamic_prompting", True):
                # Look for specific error patterns and add targeted guidance
                error_types = [issue.get("type", "") for issue in latest_entry["feedback"]["issues"]]

                if "syntax" in " ".join(error_types).lower():
                    user_message += "\n\nPay careful attention to syntax correctness. Double-check all parentheses, indentation, and function definitions."

                if "test_failure" in " ".join(error_types).lower():
                    user_message += "\n\nFocus on making the code pass the failing tests. Carefully trace through the code execution for each test case."

                if "edge_case" in " ".join(error_types).lower() or "boundary" in " ".join(error_types).lower():
                    user_message += "\n\nBe sure to handle edge cases such as empty inputs, boundary values, and special cases."

                if "performance" in " ".join(error_types).lower():
                    user_message += "\n\nOptimize your solution for better performance. Avoid unnecessary operations and inefficient data structures."

        return system_prompt, user_message

    def _extract_code(self, text: str) -> str:
        """
        Extract code from the model's response.

        Args:
            text: The model's response

        Returns:
            Extracted code
        """
        # Try to extract code from markdown code blocks
        code_blocks = re.findall(r'```(?:python)?\s*(.*?)\s*```', text, re.DOTALL)

        if code_blocks:
            return code_blocks[0].strip()

        # If no code blocks, return the full text (it might be just code)
        return text.strip()

    def _reset_recursive_state(self):
        """Reset the recursive state for a new problem."""
        self.recursive_state = {
            "reflection_depth": 0,
            "adaptation_vector": [0.0] * 5,
            "attribution_map": {},
            "error_frequency": {},
            "solution_quality_trend": [],
        }

    def _update_recursive_state(self, history: List[Dict[str, Any]]):
        """
        Update recursive state based on solution history.

        Args:
            history: History of previous solution attempts
        """
        # Extract scores from history
        scores = [entry.get("result", {}).get("score", 0.0) for entry in history]
        self.recursive_state["solution_quality_trend"] = scores

        # Calculate adaptation vector
        if len(scores) >= 2:
            # Dimension 0: Overall improvement trajectory
            improvement = scores[-1] - scores[0]
            self.recursive_state["adaptation_vector"][0] = max(-1.0, min(1.0, improvement))

            # Dimension 1: Recent improvement
            recent_improvement = scores[-1] - scores[-2]
            self.recursive_state["adaptation_vector"][1] = max(-1.0, min(1.0, recent_improvement))

        # Update error frequency from latest feedback
        if history:
            latest_feedback = history[-1].get("feedback", {})
            issues = latest_feedback.get("issues", [])

            for issue in issues:
                issue_type = issue.get("type", "unknown")
                self.recursive_state["error_frequency"][issue_type] = self.recursive_state["error_frequency"].get(issue_type, 0) + 1

        # Update reflection depth
        self.recursive_state["reflection_depth"] = len(history)

    def get_meta_information(self) -> Dict[str, Any]:
        """
        Get meta information about the model.

        Returns:
            Dictionary containing model information
        """
        return {
            "model_name": self.model_identifier,
            "provider": "Anthropic",
            "type": "API",
            "parameters": self.api_params,
            "system_prompt": self.prompts["system"],
            "recursive_capability": self.recursive_config.get("enable_self_reflection", True),
            "reflection_depth": self.recursive_state["reflection_depth"],
            "adaptation_vector": self.recursive_state["adaptation_vector"]
        }

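For orientation, a minimal usage sketch for the adapter above. The `problem` payload mirrors the fields `_format_messages` reads (`description`, `code_context.code`, `code_context.tests`); the concrete task, model name, and values are illustrative only, not part of the benchmark data.

```python
# Hypothetical example: drive one recursive solve step with AnthropicModel.
from recursive_swe_bench.models.anthropic import AnthropicModel

model = AnthropicModel(
    model_identifier="claude-3-opus-20240229",
    config={"api_params": {"temperature": 0.2, "max_tokens": 2000}},
)

problem = {
    "description": "The median() helper returns the wrong value for even-length lists.",
    "code_context": {
        "code": "def median(xs):\n    xs = sorted(xs)\n    return xs[len(xs) // 2]\n",
        "tests": [{"name": "test_even_length", "content": "assert median([1, 2, 3, 4]) == 2.5"}],
    },
}

# First pass: no history, so the recursive state is reset before the API call.
fixed_code = model.solve(problem)
print(fixed_code)
```

On later iterations the evaluation harness would pass the accumulated `history` entries back into `solve`, which is what triggers the reflection prompt and the dynamic prompt adjustments shown above.
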
# recursive_swe_bench/evaluation/recursive_metrics.py

import numpy as np
import scipy.stats
from typing import Any, Dict, List, Optional, Union
import dataclasses
import math

from recursive_swe_bench.core.recursive_task import Trajectory


class RecursiveLearningCurveArea:
    """
    Measures the area under the learning curve across iterations.

    This metric captures the overall performance of a model throughout its
    learning trajectory, rewarding both high scores and quick improvement.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the recursive learning curve area metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}
        self.max_score = self.config.get("max_score", 1.0)
        self.normalize = self.config.get("normalize", True)

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the area under the learning curve.

        Args:
            trajectory: The solution trajectory

        Returns:
            The normalized area under the learning curve
        """
        scores = trajectory.get_score_series()
        if not scores:
            return 0.0

        # Calculate the area under the curve using trapezoidal rule
        area = np.trapz(scores, dx=1.0)

        # Normalize by the maximum possible area if requested
        if self.normalize:
            max_area = self.max_score * len(scores)
            return area / max_area

        return area


class AdaptationRate:
    """
    Measures the rate at which the model improves its solutions.

    This metric captures how quickly a model adapts to feedback and
    improves its solutions across iterations.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the adaptation rate metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}
        self.min_iterations = self.config.get("min_iterations", 2)

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the adaptation rate.

        Args:
            trajectory: The solution trajectory

        Returns:
            The adaptation rate
        """
        scores = trajectory.get_score_series()
        if len(scores) < self.min_iterations:
            return 0.0

        # Calculate the average improvement per iteration
        total_improvement = scores[-1] - scores[0]
        iterations = len(scores) - 1

        return total_improvement / iterations


class RecursiveVolatility:
    """
    Measures the volatility of solution quality across iterations.

    This metric captures how stable or erratic a model's performance
    is across iterations.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the recursive volatility metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}
        self.min_iterations = self.config.get("min_iterations", 3)
        self.normalize = self.config.get("normalize", True)

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the recursive volatility.

        Args:
            trajectory: The solution trajectory

        Returns:
            The normalized volatility
        """
        scores = trajectory.get_score_series()
        if len(scores) < self.min_iterations:
            return 0.0

        # Calculate the standard deviation of score changes
        changes = [abs(scores[i] - scores[i-1]) for i in range(1, len(scores))]
        volatility = np.std(changes)

        # Normalize by the average score if requested
        if self.normalize and np.mean(scores) > 0:
            return volatility / np.mean(scores)

        return volatility


class ConvergenceIndex:
    """
    Measures how quickly the model converges to a stable solution.

    This metric captures how efficiently a model reaches a stable solution
    across iterations.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the convergence index metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}
        self.stability_threshold = self.config.get("stability_threshold", 0.05)
        self.max_score_threshold = self.config.get("max_score_threshold", 0.95)

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the convergence index.

        Args:
            trajectory: The solution trajectory

        Returns:
            The convergence index (lower is better)
        """
        scores = trajectory.get_score_series()
        if not scores:
            return 0.0

        # Find the first iteration where the score stabilizes
        # (subsequent changes are below the stability threshold)
        convergence_point = len(scores) - 1
        for i in range(1, len(scores)):
            remaining_changes = [abs(scores[j] - scores[j-1]) for j in range(i, len(scores))]
            if all(change <= self.stability_threshold for change in remaining_changes):
                convergence_point = i
                break

        # Find the first iteration where the score exceeds the max score threshold
        max_score_point = len(scores)
        for i, score in enumerate(scores):
            if score >= self.max_score_threshold:
                max_score_point = i
                break

        # Return a combined index
        # Lower is better - converging quickly to a high score is ideal
        return (convergence_point / len(scores)) * (1.0 - max(0.0, min(1.0, scores[-1])))


class ErrorRecoveryEfficiency:
    """
    Measures how efficiently the model recovers from errors.

    This metric captures how well a model addresses and fixes specific
    errors across iterations.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the error recovery efficiency metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the error recovery efficiency.

        Args:
            trajectory: The solution trajectory

        Returns:
            The error recovery efficiency
        """
        if not trajectory.steps or len(trajectory.steps) < 2:
            return 0.0

        # Extract error counts from each step
        error_counts = []
        for step in trajectory.steps:
            if hasattr(step, "result") and hasattr(step.result, "error_details"):
                error_counts.append(len(step.result.error_details or {}))
            else:
                # If no error details available, use issue count from feedback
                error_counts.append(len(step.feedback.issues))

        if not error_counts or error_counts[0] == 0:
            return 1.0  # Perfect if no initial errors

        # Calculate the rate at which errors are fixed
        initial_errors = error_counts[0]
        final_errors = error_counts[-1]

        # Return the proportion of errors fixed
        return (initial_errors - final_errors) / initial_errors


class DynamicComplexityHandling:
    """
    Measures how well the model handles varying problem complexity.

    This metric evaluates performance while accounting for changes in
    problem difficulty across iterations.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the dynamic complexity handling metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the dynamic complexity handling score.

        Args:
            trajectory: The solution trajectory

        Returns:
            The dynamic complexity handling score
        """
        if not trajectory.steps:
            return 0.0

        # Extract scores and difficulties from each step
        scores = []
        difficulties = []

        for step in trajectory.steps:
            scores.append(step.result.score)
            difficulties.append(step.problem_state.difficulty)

        # Calculate difficulty-weighted scores
        weighted_scores = [scores[i] / max(0.1, difficulties[i]) for i in range(len(scores))]

        # Return the average weighted score
        return sum(weighted_scores) / len(weighted_scores)


class RecursiveFrameworkMetrics:
    """
    Comprehensive collection of metrics for recursive evaluation.

    This class provides easy access to all recursive metrics and
    standardized calculation across trajectories.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the recursive framework metrics.

        Args:
            config: Configuration options
        """
        self.config = config or {}

        # Initialize all metrics
        self.metrics = {
            "learning_curve_area": RecursiveLearningCurveArea(self.config.get("learning_curve_area")),
            "adaptation_rate": AdaptationRate(self.config.get("adaptation_rate")),
            "volatility": RecursiveVolatility(self.config.get("volatility")),
            "convergence_index": ConvergenceIndex(self.config.get("convergence_index")),
            "error_recovery": ErrorRecoveryEfficiency(self.config.get("error_recovery")),
            "complexity_handling": DynamicComplexityHandling(self.config.get("complexity_handling"))
        }

        # Add custom metrics from config if provided
        if "custom_metrics" in self.config:
            for name, metric in self.config["custom_metrics"].items():
                self.metrics[name] = metric

    def calculate_all(self, trajectory: Trajectory) -> Dict[str, float]:
        """
        Calculate all metrics for a trajectory.

        Args:
            trajectory: The solution trajectory

        Returns:
            Dictionary of metric names and values
        """
        return {name: metric.calculate(trajectory)
                for name, metric in self.metrics.items()}

    def calculate(self, trajectory: Trajectory, metric_name: str) -> float:
        """
        Calculate a specific metric for a trajectory.

        Args:
            trajectory: The solution trajectory
            metric_name: The name of the metric to calculate

        Returns:
            The calculated metric value
        """
        if metric_name not in self.metrics:
            raise ValueError(f"Unknown metric: {metric_name}")

        return self.metrics[metric_name].calculate(trajectory)

    def aggregate_metrics(self, trajectories: List[Trajectory]) -> Dict[str, float]:
        """
        Calculate aggregate metrics across multiple trajectories.

        Args:
            trajectories: List of solution trajectories

        Returns:
            Dictionary of aggregated metric values
        """
        if not trajectories:
            return {}

        all_metrics = [self.calculate_all(trajectory) for trajectory in trajectories]

        # Aggregate by averaging each metric
        aggregated = {}
        for metric_name in self.metrics:
            values = [metrics[metric_name] for metrics in all_metrics]
            aggregated[metric_name] = sum(values) / len(values)

        return aggregated

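The metric classes above only touch `Trajectory.get_score_series()` and `Trajectory.steps`, so they can be exercised with a small duck-typed stand-in. A minimal sketch follows; the `FakeTrajectory` helper is hypothetical and used only to show what each metric consumes.

```python
# Hypothetical example: the metrics only need .get_score_series() and .steps,
# so a small stand-in object is enough to exercise them outside the full harness.
from types import SimpleNamespace
from recursive_swe_bench.evaluation.recursive_metrics import (
    RecursiveLearningCurveArea, AdaptationRate, RecursiveVolatility,
)

class FakeTrajectory:
    """Duck-typed stand-in exposing the fields the metric classes read."""
    def __init__(self, scores, difficulties):
        self._scores = scores
        self.steps = [
            SimpleNamespace(
                result=SimpleNamespace(score=s, error_details=None),
                problem_state=SimpleNamespace(difficulty=d),
                feedback=SimpleNamespace(issues=[]),
            )
            for s, d in zip(scores, difficulties)
        ]
    def get_score_series(self):
        return self._scores

traj = FakeTrajectory(scores=[0.2, 0.5, 0.8], difficulties=[0.6, 0.6, 0.7])
print(RecursiveLearningCurveArea().calculate(traj))  # normalized area under the score curve
print(AdaptationRate().calculate(traj))              # average per-iteration improvement
print(RecursiveVolatility().calculate(traj))         # spread of score-to-score changes
```

In a real run the harness would produce genuine `Trajectory` objects and `RecursiveFrameworkMetrics.calculate_all` would be called on each of them.
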
# recursive_swe_bench/evaluation/visualizer.py

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from typing import Any, Dict, List, Optional, Union
import os
import json
import seaborn as sns
from pathlib import Path

from recursive_swe_bench.core.recursive_task import Trajectory


class RecursiveVisualizer:
    """
    Visualization tools for recursive evaluation results.

    This class provides methods for visualizing recursive trajectories,
    metrics, and comparative analysis across models.
    """

    def __init__(self, output_dir: Optional[str] = None, config: Dict[str, Any] = None):
        """
        Initialize the recursive visualizer.

        Args:
            output_dir: Directory to save visualizations
            config: Configuration options
        """
        self.output_dir = output_dir
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        self.config = config or {}
        self.theme = self.config.get("theme", "default")

        # Set up the visualization style
        if self.theme == "dark":
            plt.style.use("dark_background")
            self.colors = sns.color_palette("viridis", 10)
        else:
            plt.style.use("seaborn-v0_8-whitegrid")
            self.colors = sns.color_palette("muted", 10)

        sns.set_context("talk")

    def plot_trajectory(
        self,
        trajectory: Trajectory,
        title: Optional[str] = None,
        show: bool = True,
        save_path: Optional[str] = None
    ):
        """
        Plot a solution trajectory showing score evolution.

        Args:
            trajectory: The solution trajectory
            title: Optional title for the plot
            show: Whether to display the plot
            save_path: Optional path to save the plot
        """
        scores = trajectory.get_score_series()
        if not scores:
            return

        plt.figure(figsize=(10, 6))

        # Plot scores
        plt.plot(range(1, len(scores) + 1), scores, marker='o',
                 linewidth=2, markersize=8, color=self.colors[0])

        # Add difficulty if available
        difficulties = [step.problem_state.difficulty for step in trajectory.steps]
        if difficulties:
            plt.plot(range(1, len(difficulties) + 1), difficulties, marker='s',
                     linewidth=2, markersize=8, color=self.colors[1], linestyle='--',
                     label='Problem Difficulty')

        # Set plot properties
        plt.title(title or f"Solution Trajectory for Task {trajectory.task_id}")
        plt.xlabel("Iteration")
        plt.ylabel("Score / Difficulty")
        plt.grid(True)
        plt.ylim(0, 1.05)
        plt.xticks(range(1, len(scores) + 1))
        plt.legend(["Solution Score", "Problem Difficulty"])

        # Save if requested
        if save_path:
            full_path = os.path.join(self.output_dir, save_path) if self.output_dir else save_path
            plt.savefig(full_path, bbox_inches='tight', dpi=300)

        # Show if requested
        if show:
            plt.show()
        else:
            plt.close()

    def plot_metrics_comparison(
        self,
        metrics_by_model: Dict[str, Dict[str, float]],
        title: Optional[str] = None,
        show: bool = True,
        save_path: Optional[str] = None
    ):
        """
        Plot a comparison of metrics across models.

        Args:
            metrics_by_model: Dictionary mapping model names to metric values
            title: Optional title for the plot
            show: Whether to display the plot
            save_path: Optional path to save the plot
        """
        if not metrics_by_model:
            return

        # Convert to DataFrame for easier plotting
        df = pd.DataFrame(metrics_by_model).T

        # Create a radar chart
        categories = list(df.columns)
        N = len(categories)

        # Create angles for each metric
        angles = [n / float(N) * 2 * np.pi for n in range(N)]
        angles += angles[:1]  # Close the loop

        # Create figure
        fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

        # Add lines for each model
        for i, (model, metrics) in enumerate(df.iterrows()):
            values = metrics.values.flatten().tolist()
            values += values[:1]  # Close the loop

            # Plot the line
            ax.plot(angles, values, linewidth=2, linestyle='solid',
                    label=model, color=self.colors[i % len(self.colors)])
            ax.fill(angles, values, alpha=0.1, color=self.colors[i % len(self.colors)])

        # Set category labels
        plt.xticks(angles[:-1], categories)

        # Set y-axis limits
        plt.ylim(0, 1)

        # Add legend
        plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

        # Set title
        plt.title(title or "Metrics Comparison Across Models")

        # Save if requested
        if save_path:
            full_path = os.path.join(self.output_dir, save_path) if self.output_dir else save_path
            plt.savefig(full_path, bbox_inches='tight',

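A brief sketch of driving the visualizer above. The `trajectory` object is assumed to come from a completed evaluation run, and the model name and paths are placeholders.

```python
# Hypothetical example: render a score-vs-iteration plot for one task.
from recursive_swe_bench.evaluation.visualizer import RecursiveVisualizer

viz = RecursiveVisualizer(output_dir="results/plots", config={"theme": "default"})

# `trajectory` is assumed to be a Trajectory produced by the evaluation harness.
viz.plot_trajectory(trajectory, title="Claude on bug-fixing task 42",
                    show=False, save_path="task_42_trajectory.png")
```
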
models/base_models.py
ADDED
@@ -0,0 +1,259 @@
# recursive_swe_bench/models/base_model.py

from typing import Any, Dict, List, Optional, Union
import logging
import time
from abc import ABC, abstractmethod

class ModelInterface(ABC):
    """
    Base interface for models that can be evaluated using Recursive-SWE-bench.

    This abstract class defines the core functionality required for a model to
    be evaluated using the recursive evaluation framework. Concrete implementations
    must provide the actual model-specific logic.
    """

    def __init__(self, model_identifier: str, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the model interface.

        Args:
            model_identifier: Identifier for the model
            config: Configuration options
        """
        self.model_identifier = model_identifier
        self.config = config or {}
        self.logger = self._setup_logger()

    def _setup_logger(self) -> logging.Logger:
        """Set up logging for the model."""
        logger = logging.getLogger(f"Model.{self.model_identifier}")
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(self.config.get("log_level", logging.INFO))
        return logger

    @abstractmethod
    def solve(self, problem: Dict[str, Any], history: Optional[List[Dict[str, Any]]] = None) -> str:
        """
        Generate a solution for the given problem.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            The generated solution
        """
        pass

    @abstractmethod
    def get_meta_information(self) -> Dict[str, Any]:
        """
        Get meta information about the model.

        Returns:
            Dictionary containing model information
        """
        pass

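To make the contract above concrete, a minimal sketch of a custom adapter built on `ModelInterface`. The trivial echo behaviour is invented here purely as a baseline illustration; a real adapter would call an actual model.

```python
# Hypothetical example: the smallest possible ModelInterface implementation.
from typing import Any, Dict, List, Optional
from recursive_swe_bench.models.base_model import ModelInterface

class EchoModel(ModelInterface):
    """Toy adapter that returns the buggy code unchanged (a do-nothing baseline)."""

    def solve(self, problem: Dict[str, Any],
              history: Optional[List[Dict[str, Any]]] = None) -> str:
        # A real adapter would generate a fix; the baseline just echoes the input code.
        return problem["code_context"]["code"]

    def get_meta_information(self) -> Dict[str, Any]:
        return {"model_name": self.model_identifier, "provider": "local", "type": "baseline"}

baseline = EchoModel("echo-baseline")
```
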
# recursive_swe_bench/models/openai.py

import openai
import json
import time  # needed for the timing logs in solve()
import backoff
from typing import Any, Dict, List, Optional, Union

from recursive_swe_bench.models.base_model import ModelInterface

class OpenAIModel(ModelInterface):
    """
    Integration with OpenAI models (GPT-3.5, GPT-4, etc.).

    This class provides integration with OpenAI's API for evaluating
    models like GPT-3.5 and GPT-4 with Recursive-SWE-bench.
    """

    def __init__(
        self,
        model_identifier: str,
        api_key: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize the OpenAI model interface.

        Args:
            model_identifier: OpenAI model identifier (e.g., "gpt-4", "gpt-3.5-turbo")
            api_key: OpenAI API key (optional if set in environment)
            config: Additional configuration options
        """
        super().__init__(model_identifier, config)

        # Set API key if provided
        if api_key:
            openai.api_key = api_key

        # Load default prompts or use config-provided ones
        self.prompts = self.config.get("prompts", {
            "system": "You are an expert programmer tasked with fixing bugs in code. Fix the code based on the description and tests.",
            "user_template": "# Bug Fixing Task\n\n{description}\n\n# Code\n```python\n{code}\n```\n\n{tests_description}\n\n# Your task\nFix the bugs in the code above. Provide only the corrected code without any explanations.",
        })

        # Configure API parameters
        self.api_params = self.config.get("api_params", {
            "temperature": 0.2,
            "max_tokens": 2000,
            "top_p": 0.95,
            "frequency_penalty": 0,
            "presence_penalty": 0,
        })

        self.logger.info(f"Initialized OpenAI model: {model_identifier}")

    # Note: the exception classes and ChatCompletion call below target the legacy
    # (pre-1.0) openai Python SDK.
    @backoff.on_exception(
        backoff.expo,
        (openai.error.RateLimitError, openai.error.ServiceUnavailableError, openai.error.APIError),
        max_tries=5
    )
    def solve(
        self,
        problem: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None
    ) -> str:
        """
        Generate a solution using the OpenAI model.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            The generated solution
        """
        self.logger.info(f"Solving problem with OpenAI model: {self.model_identifier}")
        start_time = time.time()

        # Format the problem for the model
        messages = self._format_messages(problem, history)

        # Make API call
        response = openai.ChatCompletion.create(
            model=self.model_identifier,
            messages=messages,
            **self.api_params
        )

        # Extract the solution from the response
        solution = response.choices[0].message.content.strip()

        end_time = time.time()
        self.logger.info(f"Solution generated in {end_time - start_time:.2f} seconds")

        return self._extract_code(solution)

    def _format_messages(
        self,
        problem: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None
    ) -> List[Dict[str, str]]:
        """
        Format the problem and history into messages for the OpenAI API.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            List of formatted messages
        """
        messages = [
            {"role": "system", "content": self.prompts["system"]}
        ]

        # Format the user message
        code = problem["code_context"]["code"]

        # Prepare tests description
        tests_description = "# Tests\n"
        if "tests" in problem["code_context"]:
            tests_description += "The code must pass the following tests:\n\n"
            for i, test in enumerate(problem["code_context"]["tests"]):
                tests_description += f"## Test {i+1}: {test['name']}\n```python\n{test['content']}\n```\n\n"
        else:
            tests_description += "The code must work correctly according to its intended functionality."

        # Create the user message using the template
        user_content = self.prompts["user_template"].format(
            description=problem["description"],
            code=code,
            tests_description=tests_description
        )

        messages.append({"role": "user", "content": user_content})

        # Add history if available
        if history and self.config.get("include_history", True):
            for entry in history:
                # Add previous attempt
                messages.append({
                    "role": "assistant",
                    "content": entry["solution"]
                })

                # Add feedback on previous attempt
                feedback_content = f"Your solution has the following issues:\n"
                for issue in entry["feedback"]["issues"]:
                    feedback_content += f"- {issue['message']}\n"

                feedback_content += "\nPlease try again with these improvements:\n"
                for suggestion in entry["feedback"]["suggestions"]:
                    feedback_content += f"- {suggestion['message']}\n"

                messages.append({
                    "role": "user",
                    "content": feedback_content
                })

        return messages

    def _extract_code(self, text: str) -> str:
        """
        Extract code from the model's response.

        Args:
            text: The model's response

        Returns:
            Extracted code
        """
        # Try to extract code from markdown code blocks
        import re
        code_blocks = re.findall(r'```(?:python)?\s*(.*?)\s*```', text, re.DOTALL)

        if code_blocks:
            return code_blocks[0].strip()

        # If no code blocks, return the full text (it might be just code)
        return text.strip()

    def get_meta_information(self) -> Dict[str, Any]:
        """
        Get meta information about the model.

        Returns:
            Dictionary containing model information
        """
        return {
            "model_name": self.model_identifier,
            "provider": "OpenAI",
            "type": "API",
            "parameters": self.api_params,
            "system_prompt": self.prompts["system"]
        }

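As with the Anthropic adapter, a minimal usage sketch. The model name, API key, and problem payload are placeholders, and the class targets the legacy pre-1.0 `openai` SDK.

```python
# Hypothetical example: single-pass solve with the OpenAI adapter.
from recursive_swe_bench.models.openai import OpenAIModel

model = OpenAIModel("gpt-4", api_key="sk-...")  # key placeholder

problem = {
    "description": "Off-by-one error when indexing the last element.",
    "code_context": {"code": "def last(xs):\n    return xs[len(xs)]\n"},
}

print(model.solve(problem))
```
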
task_generators/bug_fixing.py
ADDED

The diff for this file is too large to render. See the raw diff in the repository.