ShauryaDamathia committed on
Commit
ea41fe3
·
verified ·
1 Parent(s): 2dbf6aa

Upload 11 files

Browse files
Files changed (11) hide show
  1. .env.example +2 -0
  2. Dataset.json +452 -0
  3. Dockerfile +16 -0
  4. README.md +92 -10
  5. agent_contract.py +40 -0
  6. app.py +87 -0
  7. environment.py +58 -0
  8. grader.py +129 -0
  9. openenv.yaml +68 -0
  10. requirements.txt +2 -0
  11. test_grader.py +359 -0
.env.example ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ AGENT_API_KEY=your_api_key_here
2
+ AGENT_API_URL=https://api.openai.com/v1/chat/completions
Dataset.json ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "log": "Failed password for invalid user admin from 185.234.217.92 port 49822 ssh2",
4
+ "system": "SSH",
5
+ "expected": {
6
+ "category": "brute_force",
7
+ "severity": "medium",
8
+ "action": "block IP and enable rate limiting"
9
+ }
10
+ },
11
+ {
12
+ "log": "Accepted publickey for ubuntu from 10.0.0.14 port 42111 ssh2",
13
+ "system": "SSH",
14
+ "expected": {
15
+ "category": "normal",
16
+ "severity": "low",
17
+ "action": "no action required"
18
+ }
19
+ },
20
+ {
21
+ "log": "POST /api/auth HTTP/1.1 401 Unauthorized from 203.91.112.44",
22
+ "system": "web server",
23
+ "expected": {
24
+ "category": "brute_force",
25
+ "severity": "medium",
26
+ "action": "enable rate limiting and captcha"
27
+ }
28
+ },
29
+ {
30
+ "log": "GET /home HTTP/1.1 200 OK from 192.168.0.22",
31
+ "system": "web server",
32
+ "expected": {
33
+ "category": "normal",
34
+ "severity": "low",
35
+ "action": "no action required"
36
+ }
37
+ },
38
+ {
39
+ "log": "User root authentication failures exceeded threshold from 45.77.23.11",
40
+ "system": "SSH",
41
+ "expected": {
42
+ "category": "brute_force",
43
+ "severity": "high",
44
+ "action": "disable root login and block source IP"
45
+ }
46
+ },
47
+ {
48
+ "log": "Suspicious upload detected: .php file via /images endpoint",
49
+ "system": "web server",
50
+ "expected": {
51
+ "category": "malware",
52
+ "severity": "high",
53
+ "action": "remove file and restrict upload types"
54
+ }
55
+ },
56
+ {
57
+ "log": "Database connection established from app node 10.1.2.3",
58
+ "system": "database",
59
+ "expected": {
60
+ "category": "normal",
61
+ "severity": "low",
62
+ "action": "no action required"
63
+ }
64
+ },
65
+ {
66
+ "log": "Outbound SMTP traffic spike detected for user sales@company.com",
67
+ "system": "email",
68
+ "expected": {
69
+ "category": "phishing",
70
+ "severity": "high",
71
+ "action": "suspend account and inspect sent emails"
72
+ }
73
+ },
74
+ {
75
+ "log": "High frequency requests to /search endpoint from 103.56.98.21",
76
+ "system": "web server",
77
+ "expected": {
78
+ "category": "dos_attack",
79
+ "severity": "medium",
80
+ "action": "apply rate limiting and block IP"
81
+ }
82
+ },
83
+ {
84
+ "log": "User analyst logged into database from 10.2.3.5",
85
+ "system": "database",
86
+ "expected": {
87
+ "category": "normal",
88
+ "severity": "low",
89
+ "action": "no action required"
90
+ }
91
+ },
92
+ {
93
+ "log": "Email attachment with macro blocked from unknown sender",
94
+ "system": "email",
95
+ "expected": {
96
+ "category": "malware",
97
+ "severity": "medium",
98
+ "action": "quarantine email and alert user"
99
+ }
100
+ },
101
+ {
102
+ "log": "Repeated AUTH failures for mailbox admin@corp.com from 51.158.32.9",
103
+ "system": "email",
104
+ "expected": {
105
+ "category": "brute_force",
106
+ "severity": "medium",
107
+ "action": "block IP and enforce MFA"
108
+ }
109
+ },
110
+ {
111
+ "log": "GET /admin HTTP/1.1 403 Forbidden from 198.18.1.2",
112
+ "system": "web server",
113
+ "expected": {
114
+ "category": "normal",
115
+ "severity": "low",
116
+ "action": "monitor for repeated attempts"
117
+ }
118
+ },
119
+ {
120
+ "log": "Slow query detected: SELECT * FROM payments exceeding threshold",
121
+ "system": "database",
122
+ "expected": {
123
+ "category": "normal",
124
+ "severity": "medium",
125
+ "action": "optimize query and indexes"
126
+ }
127
+ },
128
+ {
129
+ "log": "SSH connection attempt using deprecated protocol version 1",
130
+ "system": "SSH",
131
+ "expected": {
132
+ "category": "malware",
133
+ "severity": "medium",
134
+ "action": "disable legacy protocol support"
135
+ }
136
+ },
137
+ {
138
+ "log": "Inbound email flagged for spoofed domain billing@secure-payments.co",
139
+ "system": "email",
140
+ "expected": {
141
+ "category": "phishing",
142
+ "severity": "high",
143
+ "action": "block sender domain and notify users"
144
+ }
145
+ },
146
+ {
147
+ "log": "GET /api/status HTTP/1.1 200 OK from 10.0.0.7",
148
+ "system": "web server",
149
+ "expected": {
150
+ "category": "normal",
151
+ "severity": "low",
152
+ "action": "no action required"
153
+ }
154
+ },
155
+ {
156
+ "log": "Multiple DB connections opened from 192.168.3.14 in short interval",
157
+ "system": "database",
158
+ "expected": {
159
+ "category": "dos_attack",
160
+ "severity": "medium",
161
+ "action": "limit connections per IP"
162
+ }
163
+ },
164
+ {
165
+ "log": "Account locked after consecutive failed SSH logins",
166
+ "system": "SSH",
167
+ "expected": {
168
+ "category": "brute_force",
169
+ "severity": "medium",
170
+ "action": "investigate IP and enable MFA"
171
+ }
172
+ },
173
+ {
174
+ "log": "Outbound email rate anomaly detected for hr@company.com",
175
+ "system": "email",
176
+ "expected": {
177
+ "category": "phishing",
178
+ "severity": "high",
179
+ "action": "disable account and audit activity"
180
+ }
181
+ },
182
+ {
183
+ "log": "Checksum mismatch detected for /usr/sbin/sshd",
184
+ "system": "SSH",
185
+ "expected": {
186
+ "category": "malware",
187
+ "severity": "high",
188
+ "action": "restore binary and investigate compromise"
189
+ }
190
+ },
191
+ {
192
+ "log": "GET /robots.txt HTTP/1.1 200 OK from 66.249.65.10",
193
+ "system": "web server",
194
+ "expected": {
195
+ "category": "normal",
196
+ "severity": "low",
197
+ "action": "no action required"
198
+ }
199
+ },
200
+ {
201
+ "log": "Database login failed for admin from 77.91.12.33",
202
+ "system": "database",
203
+ "expected": {
204
+ "category": "brute_force",
205
+ "severity": "medium",
206
+ "action": "restrict access and rotate credentials"
207
+ }
208
+ },
209
+ {
210
+ "log": "Email contains suspicious shortened URL",
211
+ "system": "email",
212
+ "expected": {
213
+ "category": "phishing",
214
+ "severity": "medium",
215
+ "action": "block URL and warn recipient"
216
+ }
217
+ },
218
+ {
219
+ "log": "Traffic surge detected on /login endpoint",
220
+ "system": "web server",
221
+ "expected": {
222
+ "category": "dos_attack",
223
+ "severity": "medium",
224
+ "action": "enable throttling and monitoring"
225
+ }
226
+ },
227
+ {
228
+ "log": "SSH session closed for user ec2-user",
229
+ "system": "SSH",
230
+ "expected": {
231
+ "category": "normal",
232
+ "severity": "low",
233
+ "action": "no action required"
234
+ }
235
+ },
236
+ {
237
+ "log": "Execution of unknown binary from /tmp/.x9",
238
+ "system": "database",
239
+ "expected": {
240
+ "category": "malware",
241
+ "severity": "high",
242
+ "action": "remove binary and scan system"
243
+ }
244
+ },
245
+ {
246
+ "log": "Incoming email rejected due to SPF validation failure",
247
+ "system": "email",
248
+ "expected": {
249
+ "category": "phishing",
250
+ "severity": "medium",
251
+ "action": "update SPF rules and monitor sender"
252
+ }
253
+ },
254
+ {
255
+ "log": "GET /login HTTP/1.1 200 OK from 172.16.1.2",
256
+ "system": "web server",
257
+ "expected": {
258
+ "category": "normal",
259
+ "severity": "low",
260
+ "action": "no action required"
261
+ }
262
+ },
263
+ {
264
+ "log": "Too many DB connections from 10.10.5.6 causing slowdown",
265
+ "system": "database",
266
+ "expected": {
267
+ "category": "dos_attack",
268
+ "severity": "high",
269
+ "action": "block IP and enforce connection limits"
270
+ }
271
+ },
272
+ {
273
+ "log": "SSH login attempt with invalid key for user admin",
274
+ "system": "SSH",
275
+ "expected": {
276
+ "category": "brute_force",
277
+ "severity": "medium",
278
+ "action": "disable password auth and monitor attempts"
279
+ }
280
+ },
281
+ {
282
+ "log": "Injected script detected in HTTP response payload",
283
+ "system": "web server",
284
+ "expected": {
285
+ "category": "malware",
286
+ "severity": "high",
287
+ "action": "sanitize inputs and deploy WAF"
288
+ }
289
+ },
290
+ {
291
+ "log": "Email impersonation attempt detected for ceo@company.com",
292
+ "system": "email",
293
+ "expected": {
294
+ "category": "phishing",
295
+ "severity": "high",
296
+ "action": "alert users and block sender"
297
+ }
298
+ },
299
+ {
300
+ "log": "Long running UPDATE query detected in orders table",
301
+ "system": "database",
302
+ "expected": {
303
+ "category": "normal",
304
+ "severity": "medium",
305
+ "action": "optimize query performance"
306
+ }
307
+ },
308
+ {
309
+ "log": "SYN flood pattern detected from 91.200.12.5",
310
+ "system": "web server",
311
+ "expected": {
312
+ "category": "dos_attack",
313
+ "severity": "high",
314
+ "action": "enable SYN protection and block IP"
315
+ }
316
+ },
317
+ {
318
+ "log": "SSH key authentication successful for user deploy",
319
+ "system": "SSH",
320
+ "expected": {
321
+ "category": "normal",
322
+ "severity": "low",
323
+ "action": "no action required"
324
+ }
325
+ },
326
+ {
327
+ "log": "Unauthorized schema change detected in production DB",
328
+ "system": "database",
329
+ "expected": {
330
+ "category": "malware",
331
+ "severity": "high",
332
+ "action": "audit changes and restore if needed"
333
+ }
334
+ },
335
+ {
336
+ "log": "Executable attachment blocked in incoming email",
337
+ "system": "email",
338
+ "expected": {
339
+ "category": "malware",
340
+ "severity": "medium",
341
+ "action": "quarantine attachment"
342
+ }
343
+ },
344
+ {
345
+ "log": "Repeated POST requests to /wp-login.php from multiple IPs",
346
+ "system": "web server",
347
+ "expected": {
348
+ "category": "brute_force",
349
+ "severity": "high",
350
+ "action": "block IPs and enable login protection"
351
+ }
352
+ },
353
+ {
354
+ "log": "Database connection timeout from application server",
355
+ "system": "database",
356
+ "expected": {
357
+ "category": "normal",
358
+ "severity": "medium",
359
+ "action": "check DB load and connectivity"
360
+ }
361
+ },
362
+ {
363
+ "log": "Email link redirecting to suspicious domain detected",
364
+ "system": "email",
365
+ "expected": {
366
+ "category": "phishing",
367
+ "severity": "high",
368
+ "action": "block link and notify users"
369
+ }
370
+ },
371
+ {
372
+ "log": "Unusual connection pattern on SSH port 22",
373
+ "system": "SSH",
374
+ "expected": {
375
+ "category": "dos_attack",
376
+ "severity": "medium",
377
+ "action": "limit connections and enable firewall"
378
+ }
379
+ },
380
+ {
381
+ "log": "GET /healthcheck HTTP/1.1 200 OK from 127.0.0.1",
382
+ "system": "web server",
383
+ "expected": {
384
+ "category": "normal",
385
+ "severity": "low",
386
+ "action": "no action required"
387
+ }
388
+ },
389
+ {
390
+ "log": "Unauthorized SELECT attempt on restricted table payroll",
391
+ "system": "database",
392
+ "expected": {
393
+ "category": "malware",
394
+ "severity": "high",
395
+ "action": "revoke permissions and audit logs"
396
+ }
397
+ },
398
+ {
399
+ "log": "High bounce rate detected for outgoing emails",
400
+ "system": "email",
401
+ "expected": {
402
+ "category": "phishing",
403
+ "severity": "medium",
404
+ "action": "investigate account compromise"
405
+ }
406
+ },
407
+ {
408
+ "log": "Failed SSH login for user test from 82.102.44.2",
409
+ "system": "SSH",
410
+ "expected": {
411
+ "category": "brute_force",
412
+ "severity": "low",
413
+ "action": "monitor activity"
414
+ }
415
+ },
416
+ {
417
+ "log": "GET /api/data HTTP/1.1 500 Internal Server Error",
418
+ "system": "web server",
419
+ "expected": {
420
+ "category": "normal",
421
+ "severity": "medium",
422
+ "action": "check application logs and fix error"
423
+ }
424
+ },
425
+ {
426
+ "log": "Scheduled database backup completed successfully",
427
+ "system": "database",
428
+ "expected": {
429
+ "category": "normal",
430
+ "severity": "low",
431
+ "action": "verify backup integrity"
432
+ }
433
+ },
434
+ {
435
+ "log": "Email flagged due to mismatched sender domain",
436
+ "system": "email",
437
+ "expected": {
438
+ "category": "phishing",
439
+ "severity": "medium",
440
+ "action": "block sender and educate users"
441
+ }
442
+ },
443
+ {
444
+ "log": "CPU spike observed during HTTP traffic surge",
445
+ "system": "web server",
446
+ "expected": {
447
+ "category": "dos_attack",
448
+ "severity": "high",
449
+ "action": "scale resources and filter traffic"
450
+ }
451
+ }
452
+ ]
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1
5
+
6
+ WORKDIR /app
7
+
8
+ COPY requirements.txt .
9
+
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ COPY . .
13
+
14
+ EXPOSE 7860
15
+
16
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,92 @@
1
- ---
2
- title: Security Log Analysis OpenENV
3
- emoji: 🏃
4
- colorFrom: gray
5
- colorTo: pink
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Meta Hackathon OpenEnv - Cyber Security Log Analysis
2
+
3
+ This project exposes a small cybersecurity log-analysis environment with a FastAPI
4
+ server, a local environment class, and an agent evaluation script.
5
+
6
+ ## Output contract
7
+
8
+ The agent must return only JSON with exactly these keys:
9
+
10
+ ```json
11
+ {
12
+ "category": "brute_force",
13
+ "severity": "high",
14
+ "action": "block source IP and enable rate limiting"
15
+ }
16
+ ```
17
+
18
+ Allowed `category` values:
19
+
20
+ - `brute_force`
21
+ - `malware`
22
+ - `phishing`
23
+ - `dos_attack`
24
+ - `normal`
25
+
26
+ Allowed `severity` values:
27
+
28
+ - `low`
29
+ - `medium`
30
+ - `high`
31
+
32
+ `action` should be a short, concrete mitigation step.
33
+
34
+ ## API endpoints
35
+
36
+ - `GET /reset` returns a random sample plus `instructions`, `allowed_categories`,
37
+ `allowed_severities`, `response_example`, and `agent_prompt`.
38
+ - `POST /step` accepts the agent JSON payload and returns the normalized reward.
39
+ - `GET /state` returns the current step count.
40
+ - `GET /tasks` describes the task tiers and output contract.
41
+ - `POST /grader` scores a `predicted` payload against an `expected` payload.
42
+ - `GET /baseline` runs one simple baseline action against a fresh sample.
43
+
44
+ ## Local setup
45
+
46
+ ```bash
47
+ python -m venv .venv
48
+ source .venv/bin/activate  # Windows (Git Bash): source .venv/Scripts/activate
49
+ pip install -r requirements.txt
50
+ uvicorn app:app --host 0.0.0.0 --port 7860
51
+ ```
52
+
53
+ For PowerShell activation, use:
54
+
55
+ ```powershell
56
+ .venv\Scripts\Activate.ps1
57
+ ```
58
+
59
+ ## Agent evaluation runner
60
+
61
+ `test_grader.py` is the local runner that calls a chat completions API, parses the
62
+ model output, grades it, and appends a record to `agent_eval_log.jsonl`.
63
+
64
+ Create a local `.env` file with:
65
+
66
+ ```env
67
+ AGENT_API_KEY=your_api_key
68
+ AGENT_API_URL=https://api.openai.com/v1/chat/completions
69
+ ```
70
+
71
+ Then run:
72
+
73
+ ```bash
74
+ python test_grader.py
75
+ ```
76
+
77
+ ## Docker
78
+
79
+ Build and run the API container with:
80
+
81
+ ```bash
82
+ docker build -t security-log-env .
83
+ docker run --rm -p 7860:7860 security-log-env
84
+ ```
85
+
86
+ ## Scoring
87
+
88
+ Scoring uses cosine similarity between vectorized predicted and expected responses.
89
+ The raw cosine value is mapped from `[-1, 1]` into the reward range `[0, 1]`:
90
+
91
+ - aligned vectors score `1`
92
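+ - opposite vectors score `0`
+ - orthogonal vectors, or a response whose vector is all zeros (for example an empty payload), score `0.5`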
agent_contract.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
# Closed vocabularies the agent may answer with.
ALLOWED_CATEGORIES = ["brute_force", "malware", "phishing", "dos_attack", "normal"]
ALLOWED_SEVERITIES = ["low", "medium", "high"]

# A well-formed response, shown to the agent as the shape to imitate.
RESPONSE_EXAMPLE = {
    "category": "normal",
    "severity": "low",
    "action": "monitor activity",
}

# Instruction sentences shared by every prompt; joined with single spaces.
BASE_AGENT_INSTRUCTIONS = " ".join(
    (
        "You are a cybersecurity log analysis agent.",
        "Analyze the provided system name and log entry, then return only a "
        "JSON object with exactly three keys: `category`, `severity`, and `action`.",
        f"`category` must be one of {ALLOWED_CATEGORIES}.",
        f"`severity` must be one of {ALLOWED_SEVERITIES}.",
        "`action` should be a short, concrete mitigation step.",
        "Do not add markdown, code fences, explanations, or extra keys.",
    )
)
20
+
21
+
22
def build_agent_prompt(log, system):
    """Assemble the complete text prompt for one log entry.

    The prompt consists of the shared instructions, a blank line, the
    ``System:`` and ``Log:`` lines, another blank line, and finally the
    example response serialized as JSON.
    """
    prompt_lines = [
        BASE_AGENT_INSTRUCTIONS,
        "",
        f"System: {system}",
        f"Log: {log}",
        "",
        "Return a response in this shape:",
        json.dumps(RESPONSE_EXAMPLE),
    ]
    return "\n".join(prompt_lines)
31
+
32
+
33
def build_agent_context(log, system):
    """Return the output-contract metadata for one log entry.

    The result carries the shared instructions, both allowed-value lists,
    the example response, and a ready-made prompt for ``log``/``system``.
    """
    context = {}
    context["instructions"] = BASE_AGENT_INSTRUCTIONS
    context["allowed_categories"] = ALLOWED_CATEGORIES
    context["allowed_severities"] = ALLOWED_SEVERITIES
    context["response_example"] = RESPONSE_EXAMPLE
    context["agent_prompt"] = build_agent_prompt(log, system)
    return context
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+
3
+ from agent_contract import (
4
+ ALLOWED_CATEGORIES,
5
+ ALLOWED_SEVERITIES,
6
+ BASE_AGENT_INSTRUCTIONS,
7
+ RESPONSE_EXAMPLE,
8
+ )
9
+ from environment import SecurityEnv
10
+ from grader import grade_response
11
+
12
+
13
+ app = FastAPI()
14
+ env = SecurityEnv()
15
+
16
+
17
@app.get("/reset")
def reset():
    """Start a new episode on the shared environment and return its observation."""
    observation = env.reset()
    return observation
20
+
21
+
22
@app.post("/step")
def step(action: dict):
    """Sanitize the submitted action and score it against the current sample.

    Every field is coerced to stripped text so that missing keys or
    non-string values cannot crash grading. ``category`` and ``severity``
    are additionally lower-cased.
    """

    def _clean(key):
        # A JSON null must count as "no answer"; str(None) would otherwise
        # inject the literal text "None" into the graded payload.
        raw = action.get(key)
        return "" if raw is None else str(raw).strip()

    safe_action = {
        "category": _clean("category").lower(),
        "severity": _clean("severity").lower(),
        "action": _clean("action"),
    }

    return env.step(safe_action)
32
+
33
+
34
@app.get("/state")
def state():
    """Return the shared environment's state as reported by ``env.state()``."""
    current_state = env.state()
    return current_state
37
+
38
+
39
@app.get("/tasks")
def tasks():
    """Describe the task tiers and the output contract agents must follow."""
    tiers = (
        ("easy", "Detect normal vs attack"),
        ("medium", "Classify category"),
        ("hard", "Category + severity + action"),
    )
    contract = {
        "instructions": BASE_AGENT_INSTRUCTIONS,
        "allowed_categories": ALLOWED_CATEGORIES,
        "allowed_severities": ALLOWED_SEVERITIES,
        "response_example": RESPONSE_EXAMPLE,
    }
    return {
        "tasks": [{"name": name, "description": text} for name, text in tiers],
        "output_contract": contract,
    }
63
+
64
+
65
@app.post("/grader")
def grader(data: dict):
    """Score a ``predicted`` payload against an ``expected`` payload.

    Args:
        data: Request body that must contain the keys ``predicted`` and
            ``expected``.

    Returns:
        ``{"score": <float>}`` as computed by ``grade_response``.

    Raises:
        HTTPException: 422 when ``predicted`` or ``expected`` is missing, so
            the client gets a validation error rather than a 500.
    """
    # Imported here so the handler is self-contained with respect to the
    # module's existing import list.
    from fastapi import HTTPException

    missing = [key for key in ("predicted", "expected") if key not in data]
    if missing:
        raise HTTPException(
            status_code=422,
            detail=f"Missing required field(s): {', '.join(missing)}",
        )

    score = grade_response(data["predicted"], data["expected"])
    return {"score": score}
72
+
73
+
74
@app.get("/baseline")
def baseline():
    """Run one fixed "normal / low / monitor" answer against a fresh sample.

    Resets the shared environment, so any episode in progress is replaced.
    """
    observation = env.reset()
    outcome = env.step(
        {"category": "normal", "severity": "low", "action": "monitor"}
    )
    return {"observation": observation, "result": outcome}
environment.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ from pathlib import Path
4
+
5
+ from agent_contract import build_agent_context
6
+ from grader import grade_response
7
+
8
+
9
# Directory containing this module; relative dataset paths resolve against it.
BASE_DIR = Path(__file__).resolve().parent
# Dataset loaded when no explicit path is supplied.
DEFAULT_DATASET_PATH = BASE_DIR.joinpath("Dataset.json")
11
+
12
+
13
class SecurityEnv:
    """Single-step log-analysis environment backed by a JSON dataset.

    Each episode serves one randomly chosen sample; ``step`` grades the
    agent's answer against that sample and always ends the episode.
    """

    def __init__(self, dataset_path=None):
        """Load the dataset and initialise an idle environment.

        Args:
            dataset_path: Optional path to the dataset JSON file. Relative
                paths are resolved against ``BASE_DIR``. When omitted (or
                falsy) ``DEFAULT_DATASET_PATH`` is used.

        Raises:
            ValueError: If the file does not contain a non-empty JSON array.
        """
        dataset_file = Path(dataset_path) if dataset_path else DEFAULT_DATASET_PATH
        if not dataset_file.is_absolute():
            dataset_file = BASE_DIR / dataset_file

        with dataset_file.open("r", encoding="utf-8") as handle:
            self.data = json.load(handle)

        # Fail at construction time instead of inside random.choice() on the
        # first reset().
        if not isinstance(self.data, list) or not self.data:
            raise ValueError(
                f"Dataset {dataset_file} must be a non-empty JSON array of samples"
            )

        self.current_sample = None
        self.step_count = 0

    def reset(self):
        """Pick a random sample, zero the step counter, and return the observation.

        The observation contains the sample's ``log`` and ``system`` plus the
        agent contract fields produced by ``build_agent_context``.
        """
        self.current_sample = random.choice(self.data)
        self.step_count = 0

        log = self.current_sample["log"]
        system = self.current_sample["system"]

        observation = {"log": log, "system": system}
        observation.update(build_agent_context(log=log, system=system))
        return observation

    def step(self, action):
        """Grade ``action`` against the current sample.

        Returns:
            A dict with ``observation`` (always ``None``), ``reward``,
            ``done`` (always ``True``) and ``info`` holding the expected answer.

        Raises:
            RuntimeError: If called before ``reset()`` has selected a sample.
        """
        if self.current_sample is None:
            # Without this guard the lookup below fails with an opaque
            # "'NoneType' object is not subscriptable".
            raise RuntimeError("reset() must be called before step()")

        expected = self.current_sample["expected"]
        reward = grade_response(action, expected)
        self.step_count += 1

        return {
            "observation": None,
            "reward": reward,
            "done": True,
            "info": {
                "expected": expected,
            },
        }

    def state(self):
        """Return the number of steps taken since the last reset."""
        return {
            "step_count": self.step_count,
        }
grader.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import math
4
+ import re
5
+
6
+
7
# Category vocabulary; a label's position fixes its one-hot slot.
CATEGORIES = ["brute_force", "malware", "phishing", "dos_attack", "normal"]

# Each severity maps to an angle (in radians) on the unit circle, so
# neighbouring severities are closer to each other than low is to high.
SEVERITY_ANGLES = {
    label: math.radians(degrees)
    for label, degrees in (("low", 150), ("medium", 90), ("high", 30))
}

# Scale applied to each component before the vectors are concatenated.
CATEGORY_WEIGHT = 0.3
SEVERITY_WEIGHT = 0.2
ACTION_WEIGHT = 0.5

# Number of hash buckets used for the action text.
ACTION_VECTOR_SIZE = 128
# Action tokens are runs of lower-case letters, digits and underscores.
TOKEN_PATTERN = re.compile(r"[a-z0-9_]+")
19
+
20
+
21
+ def _safe_text(value):
22
+ return str(value or "").strip().lower()
23
+
24
+
25
+ def _normalize_payload(payload):
26
+ if isinstance(payload, str):
27
+ try:
28
+ payload = json.loads(payload)
29
+ except json.JSONDecodeError:
30
+ payload = {"action": payload}
31
+
32
+ if not isinstance(payload, dict):
33
+ payload = {}
34
+
35
+ return {
36
+ "category": _safe_text(payload.get("category")),
37
+ "severity": _safe_text(payload.get("severity")),
38
+ "action": _safe_text(payload.get("action")),
39
+ }
40
+
41
+
42
+ def _normalize_vector(values):
43
+ norm = math.sqrt(sum(value * value for value in values))
44
+ if norm == 0.0:
45
+ return values
46
+
47
+ return [value / norm for value in values]
48
+
49
+
50
+ def _centered_one_hot(value, vocabulary):
51
+ if value not in vocabulary:
52
+ return [0.0] * len(vocabulary)
53
+
54
+ off_value = -1.0 / (len(vocabulary) - 1)
55
+ vector = [off_value] * len(vocabulary)
56
+ vector[vocabulary.index(value)] = 1.0
57
+ return _normalize_vector(vector)
58
+
59
+
60
+ def _severity_vector(value):
61
+ angle = SEVERITY_ANGLES.get(value)
62
+ if angle is None:
63
+ return [0.0, 0.0]
64
+
65
+ return [math.cos(angle), math.sin(angle)]
66
+
67
+
68
+ def _hash_feature(feature):
69
+ digest = hashlib.sha256(feature.encode("utf-8")).digest()
70
+ index = int.from_bytes(digest[:4], "big") % ACTION_VECTOR_SIZE
71
+ sign = 1.0 if digest[4] % 2 == 0 else -1.0
72
+ return index, sign
73
+
74
+
75
+ def _action_vector(action):
76
+ tokens = TOKEN_PATTERN.findall(action)
77
+ if not tokens:
78
+ return [0.0] * ACTION_VECTOR_SIZE
79
+
80
+ vector = [0.0] * ACTION_VECTOR_SIZE
81
+
82
+ for token in tokens:
83
+ index, sign = _hash_feature(token)
84
+ vector[index] += sign
85
+
86
+ for left, right in zip(tokens, tokens[1:]):
87
+ index, sign = _hash_feature(f"{left}_{right}")
88
+ vector[index] += 0.5 * sign
89
+
90
+ return _normalize_vector(vector)
91
+
92
+
93
+ def response_to_vector(payload):
94
+ normalized = _normalize_payload(payload)
95
+ category_vector = [
96
+ value * CATEGORY_WEIGHT
97
+ for value in _centered_one_hot(normalized["category"], CATEGORIES)
98
+ ]
99
+ severity_vector = [
100
+ value * SEVERITY_WEIGHT
101
+ for value in _severity_vector(normalized["severity"])
102
+ ]
103
+ action_vector = [
104
+ value * ACTION_WEIGHT
105
+ for value in _action_vector(normalized["action"])
106
+ ]
107
+
108
+ return category_vector + severity_vector + action_vector
109
+
110
+
111
+ def cosine_similarity(left, right):
112
+ dot = sum(left_value * right_value for left_value, right_value in zip(left, right))
113
+ left_norm = math.sqrt(sum(value * value for value in left))
114
+ right_norm = math.sqrt(sum(value * value for value in right))
115
+
116
+ if left_norm == 0.0 or right_norm == 0.0:
117
+ return 0.0
118
+
119
+ cosine = dot / (left_norm * right_norm)
120
+ return max(-1.0, min(1.0, cosine))
121
+
122
+
123
+ def grade_response(predicted, expected):
124
+ predicted_vector = response_to_vector(predicted)
125
+ expected_vector = response_to_vector(expected)
126
+ cosine = cosine_similarity(predicted_vector, expected_vector)
127
+
128
+ # Maps cosine similarity from [-1, 1] into the required reward range [0, 1].
129
+ return round((cosine + 1.0) / 2.0, 4)
openenv.yaml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: security-log-env
2
+ description: >
3
+ A reinforcement learning environment for cybersecurity log analysis.
4
+ The agent must analyze system logs and return a JSON answer with the keys
5
+ category, severity, and action. Scoring is based on cosine similarity
6
+ between vectorized expected and predicted responses, normalized to [0, 1].
7
+
8
+ version: 1.0.0
9
+
10
+ tasks:
11
+ - name: easy
12
+ description: Detect whether the log is normal or an attack
13
+
14
+ - name: medium
15
+ description: Classify the type of attack
16
+
17
+ - name: hard
18
+ description: Predict category, severity, and mitigation action
19
+
20
+ action_space:
21
+ type: object
22
+ properties:
23
+ category:
24
+ type: string
25
+ enum: [brute_force, malware, phishing, dos_attack, normal]
26
+ description: Must match one of the allowed category labels.
27
+ severity:
28
+ type: string
29
+ enum: [low, medium, high]
30
+ description: Must match one of the allowed severity labels.
31
+ action:
32
+ type: string
33
+ description: A short, concrete mitigation action.
34
+
35
+ observation_space:
36
+ type: object
37
+ properties:
38
+ log:
39
+ type: string
40
+ system:
41
+ type: string
42
+ instructions:
43
+ type: string
44
+ description: Output contract for the agent.
45
+ allowed_categories:
46
+ type: array
47
+ items:
48
+ type: string
49
+ description: Allowed category labels for the response.
50
+ allowed_severities:
51
+ type: array
52
+ items:
53
+ type: string
54
+ description: Allowed severity labels for the response.
55
+ agent_prompt:
56
+ type: string
57
+ description: Ready-to-send prompt for an OpenAI agent.
58
+ response_example:
59
+ type: object
60
+ properties:
61
+ category:
62
+ type: string
63
+ severity:
64
+ type: string
65
+ action:
66
+ type: string
67
+
68
+ reward_range: [0, 1]
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ fastapi>=0.110,<1.0
2
+ uvicorn>=0.29,<1.0
test_grader.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime, timezone
4
+ from pathlib import Path
5
+ from urllib import error, request
6
+ from urllib.parse import urlparse
7
+
8
+ from environment import SecurityEnv
9
+
10
+
11
# Optional dotenv file expected in the same directory as this script.
ENV_FILE = Path(__file__).resolve().parent.joinpath(".env")

# Model ids tried first, in this order, when choosing from the provider's
# model listing.
PREFERRED_MODELS = [
    "llama-3.3-70b-versatile",
    "groq/compound-mini",
    "openai/gpt-oss-120b",
    "llama-3.1-8b-instant",
    "groq/compound",
    "openai/gpt-oss-20b",
]

# Substrings that mark a model id as not being a chat model.
NON_CHAT_MODEL_MARKERS = [
    "whisper",
    "tts",
    "transcribe",
    "transcription",
    "speech",
    "vision-preview",
]

# Cached model id; stays None until a model has been resolved.
_RESOLVED_MODEL = None
29
+
30
+
31
def load_dotenv(dotenv_path):
    """Copy ``KEY=VALUE`` pairs from a dotenv file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Keys and values are stripped of surrounding whitespace, and
    values additionally lose any leading/trailing quote characters.
    Variables that already exist in the environment are left untouched.
    Nothing happens when the file does not exist.

    Args:
        dotenv_path: ``pathlib.Path`` of the dotenv file.
    """
    if not dotenv_path.exists():
        return

    content = dotenv_path.read_text(encoding="utf-8")
    for entry in (raw.strip() for raw in content.splitlines()):
        is_assignment = bool(entry) and not entry.startswith("#") and "=" in entry
        if not is_assignment:
            continue

        # Split on the first "=" only so values may contain "=" themselves.
        name, _, raw_value = entry.partition("=")
        os.environ.setdefault(name.strip(), raw_value.strip().strip("\"'"))
44
+
45
+
46
# Import settings from the local dotenv file before they are read below.
load_dotenv(ENV_FILE)


# Settings shared by every helper in this script.
AGENT_CONFIG = dict(
    # Endpoint and credential come from the environment (or the dotenv file).
    api_url=os.environ.get("AGENT_API_URL", "https://api.openai.com/v1/chat/completions"),
    api_key=os.environ.get("AGENT_API_KEY", ""),
    # Sampling and size limits sent with each chat request.
    temperature=0,
    max_tokens=300,
    # Timeout (seconds) passed to every HTTP call.
    timeout_seconds=60,
    # When true, the graded result is sent back to the agent in a follow-up call.
    send_feedback_to_agent=True,
    # Number of evaluation runs performed by main().
    runs=1,
    # JSON-lines file that receives one record per evaluation.
    log_file=Path(__file__).resolve().parent / "agent_eval_log.jsonl",
)
59
+
60
+
61
def normalize_chat_completions_url(api_url):
    """Return the chat-completions endpoint for a configured API URL.

    Trailing slashes are removed. A URL that ends in ``/v1`` (which includes
    ``/openai/v1``) is treated as an API base and gets ``/chat/completions``
    appended. Every other URL, including one that already ends in
    ``/chat/completions``, is returned as-is apart from the slash trimming.

    Args:
        api_url: Configured endpoint or API base URL.

    Returns:
        The normalized URL string.
    """
    base = api_url.rstrip("/")
    # A single suffix test is enough: "/openai/v1" also ends with "/v1", and
    # an existing "/chat/completions" suffix simply falls through unchanged.
    if base.endswith("/v1"):
        return f"{base}/chat/completions"
    return base
70
+
71
+
72
def build_models_url(api_url):
    """Derive the model-listing URL from the configured API URL.

    The URL is first normalized to the chat-completions endpoint; when it
    ends in ``/chat/completions`` that suffix is replaced by ``/models``,
    otherwise ``/models`` is appended.
    """
    suffix = "/chat/completions"
    chat_url = normalize_chat_completions_url(api_url)
    base = chat_url[:-len(suffix)] if chat_url.endswith(suffix) else chat_url
    return f"{base}/models"
77
+
78
+
79
def get_provider_host():
    """Return the lower-cased network location of the configured endpoint."""
    endpoint = normalize_chat_completions_url(AGENT_CONFIG["api_url"])
    parsed = urlparse(endpoint)
    return parsed.netloc.lower()
81
+
82
+
83
def build_headers():
    """Build the HTTP headers sent with every provider request.

    The ``Authorization`` bearer header is included only when an API key is
    configured.
    """
    api_key = AGENT_CONFIG["api_key"]
    auth_header = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    return {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "User-Agent": "Meta-Hackathon-OpenEnv/1.0",
        **auth_header,
    }
92
+
93
+
94
def is_chat_model(model_id):
    """Return False when the model id contains a non-chat marker.

    The comparison is case-insensitive and checks every entry of
    ``NON_CHAT_MODEL_MARKERS`` as a substring.
    """
    candidate = model_id.lower()
    for marker in NON_CHAT_MODEL_MARKERS:
        if marker in candidate:
            return False
    return True
97
+
98
+
99
def choose_model(model_items):
    """Pick the model id to use from a provider's model listing.

    Selection order: the first entry of ``PREFERRED_MODELS`` that is
    available, then the first available id accepted by ``is_chat_model``,
    then simply the first available id.

    An entry counts as available unless it explicitly carries a falsy
    ``active`` value. Listings that have no ``active`` field at all
    (OpenAI-style model objects) are therefore usable instead of being
    rejected wholesale. Entries that are not dicts or have no ``id`` are
    ignored.

    Args:
        model_items: Iterable of model description dicts.

    Returns:
        The chosen model id.

    Raises:
        RuntimeError: If no entry is available.
    """
    active_ids = [
        item["id"]
        for item in model_items
        # A missing "active" key means the provider does not report the
        # flag, not that the model is disabled.
        if isinstance(item, dict) and item.get("id") and item.get("active", True)
    ]
    if not active_ids:
        raise RuntimeError("The provider returned no active models.")

    for model_id in PREFERRED_MODELS:
        if model_id in active_ids:
            return model_id

    for model_id in active_ids:
        if is_chat_model(model_id):
            return model_id

    return active_ids[0]
113
+
114
+
115
def resolve_model():
    """Return the model id to use, discovering it from the provider once.

    The first resolved id is cached in the module-level ``_RESOLVED_MODEL``
    and returned by later calls without another request.

    Raises:
        RuntimeError: If the model listing request returns an HTTP error or
            the endpoint cannot be reached.
    """
    global _RESOLVED_MODEL
    if not _RESOLVED_MODEL:
        listing_request = request.Request(
            build_models_url(AGENT_CONFIG["api_url"]),
            headers=build_headers(),
            method="GET",
        )

        try:
            with request.urlopen(
                listing_request,
                timeout=AGENT_CONFIG["timeout_seconds"],
            ) as response:
                raw_body = response.read()
                payload = json.loads(raw_body.decode("utf-8"))
        except error.HTTPError as exc:
            # HTTPError must be handled before URLError, its base class.
            body = exc.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"Model discovery failed with HTTP {exc.code}: {body}") from exc
        except error.URLError as exc:
            raise RuntimeError(f"Could not reach model discovery endpoint: {exc}") from exc

        _RESOLVED_MODEL = choose_model(payload.get("data", []))

    return _RESOLVED_MODEL
138
+
139
+
140
def build_messages(observation):
    """Build the initial chat messages for one observation.

    The system message carries ``observation["instructions"]``; the user
    message carries the system name, the log line and the JSON-serialized
    ``observation["response_example"]`` as the required answer shape.
    """
    user_text = (
        f"System: {observation['system']}\n"
        f"Log: {observation['log']}\n\n"
        "Return only JSON with this shape:\n"
        f"{json.dumps(observation['response_example'])}"
    )
    system_message = {"role": "system", "content": observation["instructions"]}
    user_message = {"role": "user", "content": user_text}
    return [system_message, user_message]
156
+
157
+
158
def build_feedback_message(observation, expected, predicted, score):
    """Build the user message that reports the grading result to the agent.

    The message repeats the original system and log, shows the expected and
    predicted answers as JSON, states the score and asks the agent to keep
    the JSON-only format.
    """
    system_name = observation["system"]
    log_line = observation["log"]
    report = (
        "Your previous answer has been graded.\n"
        f"Original system: {system_name}\n"
        f"Original log: {log_line}\n"
        f"Expected answer: {json.dumps(expected)}\n"
        f"Your answer: {json.dumps(predicted)}\n"
        f"Score: {score}\n\n"
        "Use this feedback to improve future answers while keeping the exact same JSON-only format."
    )
    return {"role": "user", "content": report}
171
+
172
+
173
def build_request_payload(messages):
    """Assemble the JSON body for a chat-completions request.

    When the provider host contains ``groq.com`` the token limit is sent as
    ``max_completion_tokens`` and a JSON-object response format is
    requested; otherwise the limit is sent as ``max_tokens``.
    """
    payload = dict(
        model=resolve_model(),
        messages=messages,
        temperature=AGENT_CONFIG["temperature"],
    )

    targets_groq = "groq.com" in get_provider_host()
    if not targets_groq:
        payload["max_tokens"] = AGENT_CONFIG["max_tokens"]
        return payload

    payload["max_completion_tokens"] = AGENT_CONFIG["max_tokens"]
    payload["response_format"] = {"type": "json_object"}
    return payload
187
+
188
+
189
def call_agent(messages):
    """POST the chat messages to the agent API and return the decoded JSON.

    Raises:
        RuntimeError: If the API answers with an HTTP error or cannot be
            reached.
    """
    body_bytes = json.dumps(build_request_payload(messages)).encode("utf-8")
    endpoint = normalize_chat_completions_url(AGENT_CONFIG["api_url"])
    chat_request = request.Request(
        endpoint,
        data=body_bytes,
        headers=build_headers(),
        method="POST",
    )

    try:
        with request.urlopen(chat_request, timeout=AGENT_CONFIG["timeout_seconds"]) as response:
            raw_body = response.read()
            return json.loads(raw_body.decode("utf-8"))
    except error.HTTPError as exc:
        # HTTPError must be handled before URLError, its base class.
        body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Agent API returned HTTP {exc.code}: {body}") from exc
    except error.URLError as exc:
        raise RuntimeError(f"Could not reach agent API: {exc}") from exc
209
+
210
+
211
def extract_response_text(response_json):
    """Return the text of the first choice in a chat-completions response.

    ``content`` may be a plain string, a list of typed parts (only parts
    with ``type == "text"`` are used, joined by newlines), or absent/null.
    Absent or null content yields an empty string rather than the literal
    text ``"None"``, and a text part whose ``text`` is null contributes an
    empty string instead of breaking the join.

    Args:
        response_json: Decoded response dict.

    Returns:
        The stripped response text.

    Raises:
        RuntimeError: If the response has no ``choices``.
    """
    choices = response_json.get("choices") or []
    if not choices:
        raise RuntimeError(f"Agent response does not contain choices: {response_json}")

    message = choices[0].get("message") or {}
    content = message.get("content")

    # A null content must not be stringified into "None".
    if content is None:
        return ""

    if isinstance(content, str):
        return content.strip()

    if isinstance(content, list):
        text_parts = [
            str(item.get("text") or "")
            for item in content
            if isinstance(item, dict) and item.get("type") == "text"
        ]
        return "\n".join(text_parts).strip()

    return str(content).strip()
230
+
231
+
232
def extract_first_json_block(text):
    """Return the first brace-balanced ``{...}`` span found in ``text``.

    Scanning starts at the first ``{``. Braces inside double-quoted string
    literals are ignored, and backslash escapes inside those strings are
    honoured, so a value such as ``"block } now"`` does not end the block
    early.

    Args:
        text: Arbitrary text that may embed a JSON object.

    Returns:
        The substring from the first ``{`` to its matching ``}``, or
        ``None`` when there is no ``{`` or the braces never balance.
    """
    start_index = text.find("{")
    if start_index == -1:
        return None

    depth = 0
    in_string = False
    escaped = False
    for index, char in enumerate(text[start_index:], start_index):
        if in_string:
            # Inside a string only the closing quote matters; a backslash
            # protects the character that follows it.
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
            continue

        if char == '"':
            in_string = True
        elif char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return text[start_index:index + 1]

    return None
248
+
249
+
250
def parse_prediction(raw_text):
    """Turn the agent's raw reply into a prediction dict.

    The whole reply is tried as JSON first, then the first embedded
    ``{...}`` block. Only JSON objects are accepted. When neither attempt
    yields an object, a fallback prediction with empty category/severity
    and the raw text as action is returned together with a warning.

    Returns:
        A ``(prediction, warning)`` tuple; ``warning`` is ``None`` on
        success.
    """
    cleaned = raw_text.strip()

    def _load_object(candidate):
        # Return the decoded dict, or None for invalid or non-object JSON.
        try:
            value = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        return value if isinstance(value, dict) else None

    whole = _load_object(cleaned)
    if whole is not None:
        return whole, None

    json_block = extract_first_json_block(cleaned)
    if json_block:
        embedded = _load_object(json_block)
        if embedded is not None:
            return embedded, None

    fallback = {"category": "", "severity": "", "action": cleaned}
    return fallback, "Agent response was not valid JSON; using raw text as action."
275
+
276
+
277
def append_log(record):
    """Append ``record`` as one JSON line to the configured log file.

    The parent directory is created when missing; non-ASCII characters are
    written unescaped.
    """
    destination = AGENT_CONFIG["log_file"]
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(record, ensure_ascii=False)
    with destination.open("a", encoding="utf-8") as handle:
        handle.write(serialized + "\n")
282
+
283
+
284
def validate_config():
    """Ensure the API URL and API key are configured.

    Raises:
        RuntimeError: If either setting is empty; the URL is checked first.
    """
    required_settings = (
        ("api_url", "Set AGENT_CONFIG['api_url'] before running this script."),
        ("api_key", "Set AGENT_API_KEY in .env before running this script."),
    )
    for setting, message in required_settings:
        if not AGENT_CONFIG[setting]:
            raise RuntimeError(message)
289
+
290
+
291
def run_single_evaluation():
    """Run one evaluation episode and return its log record.

    Steps: reset a fresh ``SecurityEnv``, ask the agent for a prediction,
    grade it through ``env.step``, optionally send the graded result back
    to the agent, then append the record to the log file.
    """
    env = SecurityEnv()
    observation = env.reset()
    expected = env.current_sample["expected"]
    selected_model = resolve_model()

    messages = build_messages(observation)
    raw_response = extract_response_text(call_agent(messages))
    predicted, parse_warning = parse_prediction(raw_response)

    score = env.step(predicted)["reward"]

    feedback_response = None
    if AGENT_CONFIG["send_feedback_to_agent"]:
        # Replay the conversation with the agent's answer and the grading
        # report appended.
        assistant_turn = {"role": "assistant", "content": raw_response}
        grading_turn = build_feedback_message(observation, expected, predicted, score)
        feedback_messages = messages + [assistant_turn, grading_turn]
        feedback_response = extract_response_text(call_agent(feedback_messages))

    query = {"system": observation["system"], "log": observation["log"]}
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "query": query,
        "prompt": observation["agent_prompt"],
        "expected": expected,
        "predicted": predicted,
        "raw_response": raw_response,
        "score": score,
        "parse_warning": parse_warning,
        "feedback_response": feedback_response,
        "provider": get_provider_host(),
        "model_used": selected_model,
    }
    append_log(record)
    return record
335
+
336
+
337
def main():
    """Validate the configuration, then run and print each evaluation."""
    validate_config()

    total_runs = AGENT_CONFIG["runs"]
    for run_index in range(total_runs):
        record = run_single_evaluation()
        query = record["query"]

        print(f"Run {run_index + 1}")
        print(f"Provider: {record['provider']}")
        print(f"Model used: {record['model_used']}")
        print(f"Query system: {query['system']}")
        print(f"Query log: {query['log']}")
        print(f"Expected: {json.dumps(record['expected'])}")
        print(f"Predicted: {json.dumps(record['predicted'])}")
        print(f"Score: {record['score']}")
        print(f"Log file: {AGENT_CONFIG['log_file']}")

        # Optional lines appear only when they carry a value.
        warning = record["parse_warning"]
        if warning:
            print(f"Warning: {warning}")
        feedback = record["feedback_response"]
        if feedback:
            print(f"Feedback response: {feedback}")
        print("-" * 60)


if __name__ == "__main__":
    main()