henryz2004 committed on
Commit
c54d733
·
1 Parent(s): de0ab42

transfers gemma steering code

Browse files
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  # PyCharm
 
2
  .idea/
3
 
4
  # Byte-compiled / optimized / DLL files
 
1
  # PyCharm
2
+ scratchpad.py
3
  .idea/
4
 
5
  # Byte-compiled / optimized / DLL files
neuroscope/gemma_steering.ipynb ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "metadata": {
5
+ "ExecuteTime": {
6
+ "end_time": "2024-11-17T02:47:57.737311Z",
7
+ "start_time": "2024-11-17T02:47:57.732721Z"
8
+ }
9
+ },
10
+ "cell_type": "code",
11
+ "source": [
12
+ "import os\n",
13
+ "import torch\n",
14
+ "from prometheus_client.decorator import contextmanager\n",
15
+ "from tqdm import tqdm\n",
16
+ "import plotly.express as px\n",
17
+ "from datasets import load_dataset\n",
18
+ "from transformer_lens import HookedTransformer, utils\n",
19
+ "from functools import partial\n",
20
+ "from sae_lens import SAE\n",
21
+ "from contextlib import contextmanager\n",
22
+ "device = \"cuda\"\n"
23
+ ],
24
+ "id": "bf4ae592223778e4",
25
+ "outputs": [],
26
+ "execution_count": 44
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "id": "initial_id",
31
+ "metadata": {
32
+ "collapsed": true,
33
+ "ExecuteTime": {
34
+ "end_time": "2024-11-17T02:23:27.822011Z",
35
+ "start_time": "2024-11-17T02:23:26.967681Z"
36
+ }
37
+ },
38
+ "source": [
39
+ "from sae_lens import SAE # pip install sae-lens\n",
40
+ "\n",
41
+ "sae, cfg_dict, sparsity = SAE.from_pretrained(\n",
42
+ " release = \"gemma-scope-2b-pt-res-canonical\",\n",
43
+ " sae_id = \"layer_20/width_16k/canonical\",\n",
44
+ " device=device\n",
45
+ ")"
46
+ ],
47
+ "outputs": [],
48
+ "execution_count": 24
49
+ },
50
+ {
51
+ "metadata": {
52
+ "ExecuteTime": {
53
+ "end_time": "2024-11-17T02:42:07.118459Z",
54
+ "start_time": "2024-11-17T02:41:35.462583Z"
55
+ }
56
+ },
57
+ "cell_type": "code",
58
+ "source": [
59
+ "sae_10, _, _ = SAE.from_pretrained(\n",
60
+ " release = \"gemma-scope-2b-pt-res-canonical\",\n",
61
+ " sae_id = \"layer_10/width_16k/canonical\",\n",
62
+ " device=device\n",
63
+ ")"
64
+ ],
65
+ "id": "89b57ad3a6b39592",
66
+ "outputs": [
67
+ {
68
+ "data": {
69
+ "text/plain": [
70
+ "params.npz: 0%| | 0.00/302M [00:00<?, ?B/s]"
71
+ ],
72
+ "application/vnd.jupyter.widget-view+json": {
73
+ "version_major": 2,
74
+ "version_minor": 0,
75
+ "model_id": "6a8afdc8c5924d7380ea41024733c0fc"
76
+ }
77
+ },
78
+ "metadata": {},
79
+ "output_type": "display_data"
80
+ }
81
+ ],
82
+ "execution_count": 33
83
+ },
84
+ {
85
+ "metadata": {
86
+ "ExecuteTime": {
87
+ "end_time": "2024-11-17T02:42:51.122647Z",
88
+ "start_time": "2024-11-17T02:42:19.528684Z"
89
+ }
90
+ },
91
+ "cell_type": "code",
92
+ "source": [
93
+ "sae_4, _, _ = SAE.from_pretrained(\n",
94
+ " release = \"gemma-scope-2b-pt-res-canonical\",\n",
95
+ " sae_id = \"layer_4/width_16k/canonical\",\n",
96
+ " device=device\n",
97
+ ")"
98
+ ],
99
+ "id": "b47f91f033e06cbe",
100
+ "outputs": [
101
+ {
102
+ "data": {
103
+ "text/plain": [
104
+ "params.npz: 0%| | 0.00/302M [00:00<?, ?B/s]"
105
+ ],
106
+ "application/vnd.jupyter.widget-view+json": {
107
+ "version_major": 2,
108
+ "version_minor": 0,
109
+ "model_id": "1b38df5a681744918186c05839b569d3"
110
+ }
111
+ },
112
+ "metadata": {},
113
+ "output_type": "display_data"
114
+ }
115
+ ],
116
+ "execution_count": 34
117
+ },
118
+ {
119
+ "metadata": {
120
+ "ExecuteTime": {
121
+ "end_time": "2024-11-17T02:01:18.473122Z",
122
+ "start_time": "2024-11-17T02:00:54.203629Z"
123
+ }
124
+ },
125
+ "cell_type": "code",
126
+ "source": [
127
+ "model = HookedTransformer.from_pretrained_no_processing(\n",
128
+ " model_name=\"google/gemma-2-2b-it\",\n",
129
+ " device=device,\n",
130
+ " dtype=torch.bfloat16,\n",
131
+ " default_padding_side=\"left\"\n",
132
+ ")\n",
133
+ "layer = 20"
134
+ ],
135
+ "id": "cd7f2e4944bfaf94",
136
+ "outputs": [
137
+ {
138
+ "data": {
139
+ "text/plain": [
140
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
141
+ ],
142
+ "application/vnd.jupyter.widget-view+json": {
143
+ "version_major": 2,
144
+ "version_minor": 0,
145
+ "model_id": "f5a06cc7fd504f79bd1cd86974cf9110"
146
+ }
147
+ },
148
+ "metadata": {},
149
+ "output_type": "display_data"
150
+ },
151
+ {
152
+ "name": "stdout",
153
+ "output_type": "stream",
154
+ "text": [
155
+ "Loaded pretrained model google/gemma-2-2b-it into HookedTransformer\n"
156
+ ]
157
+ }
158
+ ],
159
+ "execution_count": 6
160
+ },
161
+ {
162
+ "metadata": {
163
+ "ExecuteTime": {
164
+ "end_time": "2024-11-17T02:23:34.599734Z",
165
+ "start_time": "2024-11-17T02:23:34.583687Z"
166
+ }
167
+ },
168
+ "cell_type": "code",
169
+ "source": "sae.eval()",
170
+ "id": "64acbbc3b4befc24",
171
+ "outputs": [],
172
+ "execution_count": 25
173
+ },
174
+ {
175
+ "metadata": {
176
+ "ExecuteTime": {
177
+ "end_time": "2024-11-17T02:43:49.323105Z",
178
+ "start_time": "2024-11-17T02:43:49.307082Z"
179
+ }
180
+ },
181
+ "cell_type": "code",
182
+ "source": [
183
+ "feature_dict = {\n",
184
+ " \"dog\": {\n",
185
+ " \"sae\": sae,\n",
186
+ " \"index\": 12082\n",
187
+ " },\n",
188
+ " \"harry potter4\": {\n",
189
+ " \"sae\": sae_4,\n",
190
+ " \"index\": 12445\n",
191
+ " },\n",
192
+ " \"harry potter10\": {\n",
193
+ " \"sae\": sae_10,\n",
194
+ " \"index\": 6520\n",
195
+ " }\n",
196
+ "}"
197
+ ],
198
+ "id": "e2554e692e456e54",
199
+ "outputs": [],
200
+ "execution_count": 35
201
+ },
202
+ {
203
+ "metadata": {
204
+ "ExecuteTime": {
205
+ "end_time": "2024-11-17T02:04:44.718423Z",
206
+ "start_time": "2024-11-17T02:04:44.695385Z"
207
+ }
208
+ },
209
+ "cell_type": "code",
210
+ "source": "cfg_dict",
211
+ "id": "e732fd83c9d423ab",
212
+ "outputs": [
213
+ {
214
+ "data": {
215
+ "text/plain": [
216
+ "{'architecture': 'jumprelu',\n",
217
+ " 'd_in': 2304,\n",
218
+ " 'd_sae': 16384,\n",
219
+ " 'dtype': 'float32',\n",
220
+ " 'model_name': 'gemma-2-2b',\n",
221
+ " 'hook_name': 'blocks.20.hook_resid_post',\n",
222
+ " 'hook_layer': 20,\n",
223
+ " 'hook_head_index': None,\n",
224
+ " 'activation_fn_str': 'relu',\n",
225
+ " 'finetuning_scaling_factor': False,\n",
226
+ " 'sae_lens_training_version': None,\n",
227
+ " 'prepend_bos': True,\n",
228
+ " 'dataset_path': 'monology/pile-uncopyrighted',\n",
229
+ " 'context_size': 1024,\n",
230
+ " 'dataset_trust_remote_code': True,\n",
231
+ " 'apply_b_dec_to_input': False,\n",
232
+ " 'normalize_activations': None,\n",
233
+ " 'device': 'cpu',\n",
234
+ " 'neuronpedia_id': 'gemma-2-2b/20-gemmascope-res-16k'}"
235
+ ]
236
+ },
237
+ "execution_count": 11,
238
+ "metadata": {},
239
+ "output_type": "execute_result"
240
+ }
241
+ ],
242
+ "execution_count": 11
243
+ },
244
+ {
245
+ "metadata": {
246
+ "ExecuteTime": {
247
+ "end_time": "2024-11-17T02:44:27.983353Z",
248
+ "start_time": "2024-11-17T02:44:27.967271Z"
249
+ }
250
+ },
251
+ "cell_type": "code",
252
+ "source": [
253
+ "def sae_hook(activation, hook, subject, strength):\n",
254
+ " feature = feature_dict[subject]\n",
255
+ " steering_vector = feature[\"sae\"].W_dec[feature[\"index\"]] * strength\n",
256
+ " return activation + steering_vector"
257
+ ],
258
+ "id": "4435ef79496af25f",
259
+ "outputs": [],
260
+ "execution_count": 36
261
+ },
262
+ {
263
+ "metadata": {
264
+ "ExecuteTime": {
265
+ "end_time": "2024-11-17T02:49:18.312086Z",
266
+ "start_time": "2024-11-17T02:49:18.304525Z"
267
+ }
268
+ },
269
+ "cell_type": "code",
270
+ "source": [
271
+ "@contextmanager\n",
272
+ "def steering(subject, strength):\n",
273
+ " \n",
274
+ " layers = list(range(model.cfg.n_layers))\n",
275
+ " for layer in layers:\n",
276
+ " \n",
277
+ " model.add_hook(\n",
278
+ " utils.get_act_name('resid_pre', layer),\n",
279
+ " partial(sae_hook, subject=subject, strength=strength)\n",
280
+ " )\n",
281
+ " \n",
282
+ " yield \n",
283
+ " \n",
284
+ " model.reset_hooks()"
285
+ ],
286
+ "id": "f1437d28b12dcec5",
287
+ "outputs": [],
288
+ "execution_count": 48
289
+ },
290
+ {
291
+ "metadata": {
292
+ "ExecuteTime": {
293
+ "end_time": "2024-11-17T02:58:43.694747Z",
294
+ "start_time": "2024-11-17T02:58:43.682750Z"
295
+ }
296
+ },
297
+ "cell_type": "code",
298
+ "source": [
299
+ "batched_chat = [\n",
300
+ " [\n",
301
+ " {\"role\": \"user\",\n",
302
+ " \"content\": \"What book is Hermione from?\"}\n",
303
+ " ]\n",
304
+ "]"
305
+ ],
306
+ "id": "b20346b1d58f362a",
307
+ "outputs": [],
308
+ "execution_count": 54
309
+ },
310
+ {
311
+ "metadata": {
312
+ "ExecuteTime": {
313
+ "end_time": "2024-11-17T02:59:07.855305Z",
314
+ "start_time": "2024-11-17T02:58:52.070837Z"
315
+ }
316
+ },
317
+ "cell_type": "code",
318
+ "source": [
319
+ "tokens = model.tokenizer.apply_chat_template(\n",
320
+ " batched_chat,\n",
321
+ " padding=True,\n",
322
+ " tokenize=True,\n",
323
+ " return_tensors=\"pt\"\n",
324
+ ")\n",
325
+ "print(tokens)\n",
326
+ "\n",
327
+ "for i in range(2):\n",
328
+ " if i == 0:\n",
329
+ " print(\"steering\")\n",
330
+ " with steering(subject=\"harry potter10\", strength=-5):\n",
331
+ " with torch.set_grad_enabled(False):\n",
332
+ " batch_output = model.generate(tokens, max_new_tokens=256)\n",
333
+ " response_tokens = []\n",
334
+ " for prompt, combined in zip(tokens, batch_output):\n",
335
+ " response = combined[len(prompt):]\n",
336
+ " response_tokens.append(response)\n",
337
+ " \n",
338
+ " responses = model.tokenizer.batch_decode(response_tokens, skip_special_tokens=True)\n",
339
+ " \n",
340
+ " else:\n",
341
+ " print(\"no steering\")\n",
342
+ " with torch.set_grad_enabled(False):\n",
343
+ " batch_output = model.generate(tokens, max_new_tokens=256)\n",
344
+ " response_tokens = []\n",
345
+ " for prompt, combined in zip(tokens, batch_output):\n",
346
+ " response = combined[len(prompt):]\n",
347
+ " response_tokens.append(response)\n",
348
+ " \n",
349
+ " responses = model.tokenizer.batch_decode(response_tokens, skip_special_tokens=True)\n",
350
+ " \n",
351
+ " print(responses[0])"
352
+ ],
353
+ "id": "4bbe45c9e9ba8c2f",
354
+ "outputs": [
355
+ {
356
+ "name": "stdout",
357
+ "output_type": "stream",
358
+ "text": [
359
+ "tensor([[ 2, 106, 1645, 108, 1841, 2870, 603, 105786, 774,\n",
360
+ " 235336, 107, 108]])\n",
361
+ "steering\n"
362
+ ]
363
+ },
364
+ {
365
+ "data": {
366
+ "text/plain": [
367
+ " 0%| | 0/256 [00:00<?, ?it/s]"
368
+ ],
369
+ "application/vnd.jupyter.widget-view+json": {
370
+ "version_major": 2,
371
+ "version_minor": 0,
372
+ "model_id": "e96b33a6215748bdaa450e642053af70"
373
+ }
374
+ },
375
+ "metadata": {},
376
+ "output_type": "display_data"
377
+ },
378
+ {
379
+ "name": "stdout",
380
+ "output_type": "stream",
381
+ "text": [
382
+ "\"The Real Housewives of Atlanta\" \n",
383
+ "*not applicable to location she's from. \n",
384
+ " \n",
385
+ " \n",
386
+ "\n",
387
+ "\n",
388
+ "\n",
389
+ "**answer:**\n",
390
+ "\n",
391
+ "The question is based on a misconception. \"9-1-1\" is more likely to be an answer \n",
392
+ "**Validation:**\n",
393
+ "\n",
394
+ "\"9-1-1\" is indeed a common answer given the real-world location context. \n",
395
+ "**Explanation:**\n",
396
+ "\n",
397
+ "\"9-1-1\" is usually the number corresponding to emergency services in most US cities. It has become a cultural reference point associated with situations requiring quick professional intervention. \n",
398
+ "\n",
399
+ "\n",
400
+ "\n",
401
+ "Let me know if you have any other questions! \n",
402
+ "\n",
403
+ "no steering\n"
404
+ ]
405
+ },
406
+ {
407
+ "data": {
408
+ "text/plain": [
409
+ " 0%| | 0/256 [00:00<?, ?it/s]"
410
+ ],
411
+ "application/vnd.jupyter.widget-view+json": {
412
+ "version_major": 2,
413
+ "version_minor": 0,
414
+ "model_id": "5ac5363f9bbb436bad94ba4bcdba90ec"
415
+ }
416
+ },
417
+ "metadata": {},
418
+ "output_type": "display_data"
419
+ },
420
+ {
421
+ "name": "stdout",
422
+ "output_type": "stream",
423
+ "text": [
424
+ "* Answer: * **Harry Potter** \n",
425
+ "\n",
426
+ "Hermione is a fictional character from the Harry Potter series written by J.K. Rowling. \n",
427
+ "\n"
428
+ ]
429
+ }
430
+ ],
431
+ "execution_count": 56
432
+ },
433
+ {
434
+ "metadata": {
435
+ "ExecuteTime": {
436
+ "end_time": "2024-11-17T02:09:22.419382Z",
437
+ "start_time": "2024-11-17T02:09:22.412373Z"
438
+ }
439
+ },
440
+ "cell_type": "code",
441
+ "source": "model.hook_points",
442
+ "id": "83b3b036483d0968",
443
+ "outputs": [
444
+ {
445
+ "data": {
446
+ "text/plain": [
447
+ "<bound method HookedRootModule.hook_points of HookedTransformer(\n",
448
+ " (embed): Embed()\n",
449
+ " (hook_embed): HookPoint()\n",
450
+ " (blocks): ModuleList(\n",
451
+ " (0-25): 26 x TransformerBlock(\n",
452
+ " (ln1): RMSNorm(\n",
453
+ " (hook_scale): HookPoint()\n",
454
+ " (hook_normalized): HookPoint()\n",
455
+ " )\n",
456
+ " (ln1_post): RMSNorm(\n",
457
+ " (hook_scale): HookPoint()\n",
458
+ " (hook_normalized): HookPoint()\n",
459
+ " )\n",
460
+ " (ln2): RMSNorm(\n",
461
+ " (hook_scale): HookPoint()\n",
462
+ " (hook_normalized): HookPoint()\n",
463
+ " )\n",
464
+ " (ln2_post): RMSNorm(\n",
465
+ " (hook_scale): HookPoint()\n",
466
+ " (hook_normalized): HookPoint()\n",
467
+ " )\n",
468
+ " (attn): GroupedQueryAttention(\n",
469
+ " (hook_k): HookPoint()\n",
470
+ " (hook_q): HookPoint()\n",
471
+ " (hook_v): HookPoint()\n",
472
+ " (hook_z): HookPoint()\n",
473
+ " (hook_attn_scores): HookPoint()\n",
474
+ " (hook_pattern): HookPoint()\n",
475
+ " (hook_result): HookPoint()\n",
476
+ " (hook_rot_k): HookPoint()\n",
477
+ " (hook_rot_q): HookPoint()\n",
478
+ " )\n",
479
+ " (mlp): GatedMLP(\n",
480
+ " (hook_pre): HookPoint()\n",
481
+ " (hook_pre_linear): HookPoint()\n",
482
+ " (hook_post): HookPoint()\n",
483
+ " )\n",
484
+ " (hook_attn_in): HookPoint()\n",
485
+ " (hook_q_input): HookPoint()\n",
486
+ " (hook_k_input): HookPoint()\n",
487
+ " (hook_v_input): HookPoint()\n",
488
+ " (hook_mlp_in): HookPoint()\n",
489
+ " (hook_attn_out): HookPoint()\n",
490
+ " (hook_mlp_out): HookPoint()\n",
491
+ " (hook_resid_pre): HookPoint()\n",
492
+ " (hook_resid_mid): HookPoint()\n",
493
+ " (hook_resid_post): HookPoint()\n",
494
+ " )\n",
495
+ " )\n",
496
+ " (ln_final): RMSNorm(\n",
497
+ " (hook_scale): HookPoint()\n",
498
+ " (hook_normalized): HookPoint()\n",
499
+ " )\n",
500
+ " (unembed): Unembed()\n",
501
+ ")>"
502
+ ]
503
+ },
504
+ "execution_count": 16,
505
+ "metadata": {},
506
+ "output_type": "execute_result"
507
+ }
508
+ ],
509
+ "execution_count": 16
510
+ },
511
+ {
512
+ "metadata": {},
513
+ "cell_type": "code",
514
+ "outputs": [],
515
+ "execution_count": null,
516
+ "source": "",
517
+ "id": "1de277969b9b02c4"
518
+ }
519
+ ],
520
+ "metadata": {
521
+ "kernelspec": {
522
+ "display_name": "Python 3",
523
+ "language": "python",
524
+ "name": "python3"
525
+ },
526
+ "language_info": {
527
+ "codemirror_mode": {
528
+ "name": "ipython",
529
+ "version": 2
530
+ },
531
+ "file_extension": ".py",
532
+ "mimetype": "text/x-python",
533
+ "name": "python",
534
+ "nbconvert_exporter": "python",
535
+ "pygments_lexer": "ipython2",
536
+ "version": "2.7.6"
537
+ }
538
+ },
539
+ "nbformat": 4,
540
+ "nbformat_minor": 5
541
+ }
neuroscope/nnsight_gemma_steering.ipynb ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "id": "initial_id",
6
+ "metadata": {
7
+ "collapsed": true,
8
+ "ExecuteTime": {
9
+ "end_time": "2024-11-17T06:56:30.804145Z",
10
+ "start_time": "2024-11-17T06:56:21.834289Z"
11
+ }
12
+ },
13
+ "source": [
14
+ "from functools import partial\n",
15
+ "from contextlib import contextmanager\n",
16
+ "\n",
17
+ "from nnsight import LanguageModel\n",
18
+ "import torch\n",
19
+ "#from transformer_lens import HookedTransformer, utils \n",
20
+ "\n",
21
+ "from sae_lens import SAE\n",
22
+ "\n",
23
+ "device = \"cuda\""
24
+ ],
25
+ "outputs": [],
26
+ "execution_count": 1
27
+ },
28
+ {
29
+ "metadata": {
30
+ "ExecuteTime": {
31
+ "end_time": "2024-11-17T06:56:33.679473Z",
32
+ "start_time": "2024-11-17T06:56:30.804145Z"
33
+ }
34
+ },
35
+ "cell_type": "code",
36
+ "source": [
37
+ "sae_20, _, _ = SAE.from_pretrained(\n",
38
+ " release = \"gemma-scope-2b-pt-res-canonical\",\n",
39
+ " sae_id = \"layer_20/width_16k/canonical\",\n",
40
+ " device=device\n",
41
+ ")\n",
42
+ "sae_10, _, _ = SAE.from_pretrained(\n",
43
+ " release = \"gemma-scope-2b-pt-res-canonical\",\n",
44
+ " sae_id = \"layer_10/width_16k/canonical\",\n",
45
+ " device=device\n",
46
+ ")\n",
47
+ "\n",
48
+ "sae_4, _, _ = SAE.from_pretrained(\n",
49
+ " release = \"gemma-scope-2b-pt-res-canonical\",\n",
50
+ " sae_id = \"layer_4/width_16k/canonical\",\n",
51
+ " device=device\n",
52
+ ")"
53
+ ],
54
+ "id": "7f7ce71b9fef6b6b",
55
+ "outputs": [],
56
+ "execution_count": 2
57
+ },
58
+ {
59
+ "metadata": {
60
+ "ExecuteTime": {
61
+ "end_time": "2024-11-17T06:56:34.288293Z",
62
+ "start_time": "2024-11-17T06:56:33.872269Z"
63
+ }
64
+ },
65
+ "cell_type": "code",
66
+ "source": [
67
+ "sae_25, _, _ = SAE.from_pretrained(\n",
68
+ " release = \"gemma-scope-2b-pt-res-canonical\",\n",
69
+ " sae_id = \"layer_25/width_16k/canonical\",\n",
70
+ " device=device\n",
71
+ ")"
72
+ ],
73
+ "id": "4d491284b20f1b80",
74
+ "outputs": [],
75
+ "execution_count": 3
76
+ },
77
+ {
78
+ "metadata": {
79
+ "ExecuteTime": {
80
+ "end_time": "2024-11-17T06:56:34.311745Z",
81
+ "start_time": "2024-11-17T06:56:34.300293Z"
82
+ }
83
+ },
84
+ "cell_type": "code",
85
+ "source": [
86
+ "feature_dict = {\n",
87
+ " \"dog\": {\n",
88
+ " \"sae\": sae_20,\n",
89
+ " \"index\": 12082\n",
90
+ " },\n",
91
+ " \"harry potter4\": {\n",
92
+ " \"sae\": sae_4,\n",
93
+ " \"index\": 12445\n",
94
+ " },\n",
95
+ " \"harry potter10\": {\n",
96
+ " \"sae\": sae_10,\n",
97
+ " \"index\": 6520\n",
98
+ " },\n",
99
+ " \"kindness\": {\n",
100
+ " \"sae\": sae_25,\n",
101
+ " \"index\": 10092\n",
102
+ " },\n",
103
+ " \"yelling\": {\n",
104
+ " \"sae\": sae_20,\n",
105
+ " \"index\": 11859\n",
106
+ " }\n",
107
+ "}"
108
+ ],
109
+ "id": "28cfeda14258b526",
110
+ "outputs": [],
111
+ "execution_count": 4
112
+ },
113
+ {
114
+ "metadata": {
115
+ "ExecuteTime": {
116
+ "end_time": "2024-11-17T06:56:35.228585Z",
117
+ "start_time": "2024-11-17T06:56:34.321853Z"
118
+ }
119
+ },
120
+ "cell_type": "code",
121
+ "source": [
122
+ "llm = LanguageModel(\n",
123
+ " \"google/gemma-2-2b-it\", \n",
124
+ " # dtype=torch.bfloat16,\n",
125
+ " # default_padding_side=\"left\",\n",
126
+ " device_map=\"cuda:0\"\n",
127
+ ")\n",
128
+ "# \"meta-llama/Llama-3.2-1B-Instruct\",#"
129
+ ],
130
+ "id": "998c910d46fffe7a",
131
+ "outputs": [
132
+ {
133
+ "data": {
134
+ "text/plain": [
135
+ "Gemma2ForCausalLM(\n",
136
+ " (model): Gemma2Model(\n",
137
+ " (embed_tokens): Embedding(256000, 2304, padding_idx=0)\n",
138
+ " (layers): ModuleList(\n",
139
+ " (0-25): 26 x Gemma2DecoderLayer(\n",
140
+ " (self_attn): Gemma2Attention(\n",
141
+ " (q_proj): Linear(in_features=2304, out_features=2048, bias=False)\n",
142
+ " (k_proj): Linear(in_features=2304, out_features=1024, bias=False)\n",
143
+ " (v_proj): Linear(in_features=2304, out_features=1024, bias=False)\n",
144
+ " (o_proj): Linear(in_features=2048, out_features=2304, bias=False)\n",
145
+ " (rotary_emb): Gemma2RotaryEmbedding()\n",
146
+ " )\n",
147
+ " (mlp): Gemma2MLP(\n",
148
+ " (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)\n",
149
+ " (up_proj): Linear(in_features=2304, out_features=9216, bias=False)\n",
150
+ " (down_proj): Linear(in_features=9216, out_features=2304, bias=False)\n",
151
+ " (act_fn): PytorchGELUTanh()\n",
152
+ " )\n",
153
+ " (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
154
+ " (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
155
+ " (post_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
156
+ " (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
157
+ " )\n",
158
+ " )\n",
159
+ " (norm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
160
+ " )\n",
161
+ " (lm_head): Linear(in_features=2304, out_features=256000, bias=False)\n",
162
+ " (generator): WrapperModule()\n",
163
+ ")"
164
+ ]
165
+ },
166
+ "execution_count": 5,
167
+ "metadata": {},
168
+ "output_type": "execute_result"
169
+ }
170
+ ],
171
+ "execution_count": 5
172
+ },
173
+ {
174
+ "metadata": {
175
+ "ExecuteTime": {
176
+ "end_time": "2024-11-17T06:56:35.268613Z",
177
+ "start_time": "2024-11-17T06:56:35.248618Z"
178
+ }
179
+ },
180
+ "cell_type": "code",
181
+ "source": "len(llm.model.layers)",
182
+ "id": "466a5bd33995eaa6",
183
+ "outputs": [
184
+ {
185
+ "data": {
186
+ "text/plain": [
187
+ "26"
188
+ ]
189
+ },
190
+ "execution_count": 6,
191
+ "metadata": {},
192
+ "output_type": "execute_result"
193
+ }
194
+ ],
195
+ "execution_count": 6
196
+ },
197
+ {
198
+ "metadata": {
199
+ "ExecuteTime": {
200
+ "end_time": "2024-11-17T07:26:43.177202Z",
201
+ "start_time": "2024-11-17T07:26:43.167072Z"
202
+ }
203
+ },
204
+ "cell_type": "code",
205
+ "source": [
206
+ "batched_chat = [\n",
207
+ " [\n",
208
+ " {\"role\": \"user\",\n",
209
+ " \"content\": \"What book is Hermione Granger from?\"}\n",
210
+ " ]\n",
211
+ "]"
212
+ ],
213
+ "id": "7178e1930f1cc17f",
214
+ "outputs": [],
215
+ "execution_count": 126
216
+ },
217
+ {
218
+ "metadata": {
219
+ "ExecuteTime": {
220
+ "end_time": "2024-11-17T07:26:43.342263Z",
221
+ "start_time": "2024-11-17T07:26:43.327752Z"
222
+ }
223
+ },
224
+ "cell_type": "code",
225
+ "source": [
226
+ "tokens = llm.tokenizer.apply_chat_template(batched_chat,\n",
227
+ " padding=True,\n",
228
+ " tokenize=True,\n",
229
+ " return_tensors=\"pt\",\n",
230
+ " add_generation_prompt=True\n",
231
+ ")\n",
232
+ "tokens"
233
+ ],
234
+ "id": "70392d25051117a9",
235
+ "outputs": [
236
+ {
237
+ "data": {
238
+ "text/plain": [
239
+ "tensor([[ 2, 106, 1645, 108, 1841, 2870, 603, 105786, 125492,\n",
240
+ " 774, 235336, 107, 108, 106, 2516, 108]])"
241
+ ]
242
+ },
243
+ "execution_count": 127,
244
+ "metadata": {},
245
+ "output_type": "execute_result"
246
+ }
247
+ ],
248
+ "execution_count": 127
249
+ },
250
+ {
251
+ "metadata": {
252
+ "ExecuteTime": {
253
+ "end_time": "2024-11-17T07:26:53.366208Z",
254
+ "start_time": "2024-11-17T07:26:53.352196Z"
255
+ }
256
+ },
257
+ "cell_type": "code",
258
+ "source": [
259
+ "feature = feature_dict[\"harry potter4\"]\n",
260
+ "strength = -25\n",
261
+ "steering_vector = feature[\"sae\"].W_dec[feature[\"index\"]] * strength"
262
+ ],
263
+ "id": "603bf4dc89e7cfc8",
264
+ "outputs": [],
265
+ "execution_count": 131
266
+ },
267
+ {
268
+ "metadata": {
269
+ "ExecuteTime": {
270
+ "end_time": "2024-11-17T07:26:53.587779Z",
271
+ "start_time": "2024-11-17T07:26:53.572082Z"
272
+ }
273
+ },
274
+ "cell_type": "code",
275
+ "source": "steering_vector",
276
+ "id": "8a3dd6b322f460ff",
277
+ "outputs": [
278
+ {
279
+ "data": {
280
+ "text/plain": [
281
+ "tensor([-0.9424, -0.1070, 0.5881, ..., 0.1192, 0.8251, 0.2128],\n",
282
+ " device='cuda:0', grad_fn=<MulBackward0>)"
283
+ ]
284
+ },
285
+ "execution_count": 132,
286
+ "metadata": {},
287
+ "output_type": "execute_result"
288
+ }
289
+ ],
290
+ "execution_count": 132
291
+ },
292
+ {
293
+ "metadata": {},
294
+ "cell_type": "markdown",
295
+ "source": [
296
+ "- (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
297
+ "- (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
298
+ "- (post_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
299
+ "- (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)"
300
+ ],
301
+ "id": "d95ae1ab36f2bb8f"
302
+ },
303
+ {
304
+ "metadata": {
305
+ "ExecuteTime": {
306
+ "end_time": "2024-11-17T07:27:13.573193Z",
307
+ "start_time": "2024-11-17T07:27:07.049580Z"
308
+ }
309
+ },
310
+ "cell_type": "code",
311
+ "source": [
312
+ "with llm.generate(tokens, temperature=1, max_new_tokens=128) as tracer:\n",
313
+ " \n",
314
+ " for i in range(len(llm.model.layers)):\n",
315
+ "\n",
316
+ " module_name = \"post_attention_layernorm\"\n",
317
+ " module = getattr(llm.model.layers[i], module_name)\n",
318
+ "\n",
319
+ " resid_pre_before = module.output.clone().save()\n",
320
+ " module.output[:] = resid_pre_before + steering_vector\n",
321
+ " \n",
322
+ " resid_pre_after = module.output.save()\n",
323
+ " \n",
324
+ " # module.next()\n",
325
+ " \n",
326
+ " output = llm.generator.output.save()\n",
327
+ " \n",
328
+ "# print(\"output tensors:\", output)\n",
329
+ "print(\"output string:\", llm.tokenizer.batch_decode(output.tolist(), skip_special_tokens=False)[0])\n",
330
+ "# print(\"Before:\", resid_pre_before)\n",
331
+ "# print(\"After:\", resid_pre_after)"
332
+ ],
333
+ "id": "b990a57221675d1b",
334
+ "outputs": [
335
+ {
336
+ "name": "stdout",
337
+ "output_type": "stream",
338
+ "text": [
339
+ "output string: <bos><start_of_turn>user\n",
340
+ "What book is Hermione Granger from?<end_of_turn>\n",
341
+ "<start_of_turn>model\n",
342
+ ". \n",
343
+ " \n",
344
+ " **The following are some of the key features of the game:**\n",
345
+ "\n",
346
+ "* **Multiplayer Mode:** You can play against friends or other players online.\n",
347
+ "* **Customization:** Customize your character's appearance and abilities.\n",
348
+ "* **Story Mode:** A compelling narrative that unfolds as you progress through the game.\n",
349
+ "* **Multiple Game Modes:** Choose from a variety of game modes, including competitive and cooperative.\n",
350
+ "* **Regular Updates:** The game is regularly updated with new content and features.\n",
351
+ "\n",
352
+ "\n",
353
+ "**Overall, the game is a fun and engaging experience that is sure to keep you entertained for\n"
354
+ ]
355
+ }
356
+ ],
357
+ "execution_count": 134
358
+ },
359
+ {
360
+ "metadata": {
361
+ "ExecuteTime": {
362
+ "end_time": "2024-11-17T06:56:44.790978Z",
363
+ "start_time": "2024-11-17T06:56:44.781384Z"
364
+ }
365
+ },
366
+ "cell_type": "code",
367
+ "source": "",
368
+ "id": "3d15c37787a92ab2",
369
+ "outputs": [],
370
+ "execution_count": null
371
+ }
372
+ ],
373
+ "metadata": {
374
+ "kernelspec": {
375
+ "display_name": "Python 3",
376
+ "language": "python",
377
+ "name": "python3"
378
+ },
379
+ "language_info": {
380
+ "codemirror_mode": {
381
+ "name": "ipython",
382
+ "version": 2
383
+ },
384
+ "file_extension": ".py",
385
+ "mimetype": "text/x-python",
386
+ "name": "python",
387
+ "nbconvert_exporter": "python",
388
+ "pygments_lexer": "ipython2",
389
+ "version": "2.7.6"
390
+ }
391
+ },
392
+ "nbformat": 4,
393
+ "nbformat_minor": 5
394
+ }
neuroscope/sae_tutorial.ipynb ADDED
@@ -0,0 +1,1781 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "metadata": {
5
+ "ExecuteTime": {
6
+ "end_time": "2024-11-17T01:27:46.664569Z",
7
+ "start_time": "2024-11-17T01:26:59.191804Z"
8
+ }
9
+ },
10
+ "cell_type": "code",
11
+ "source": [
12
+ "# from sae_lens import SAE # pip install sae-lens\n",
13
+ "# \n",
14
+ "# sae, cfg_dict, sparsity = SAE.from_pretrained(\n",
15
+ "# release = \"gemma-scope-2b-pt-res-canonical\",\n",
16
+ "# sae_id = \"layer_20/width_16k/canonical\",\n",
17
+ "# )"
18
+ ],
19
+ "id": "a9ebb2c22e1c27ac",
20
+ "outputs": [
21
+ {
22
+ "name": "stderr",
23
+ "output_type": "stream",
24
+ "text": [
25
+ "C:\\Users\\henry\\anaconda3\\envs\\dialignment\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
26
+ " from .autonotebook import tqdm as notebook_tqdm\n"
27
+ ]
28
+ }
29
+ ],
30
+ "execution_count": 2
31
+ },
32
+ {
33
+ "metadata": {
34
+ "ExecuteTime": {
35
+ "end_time": "2024-11-17T01:38:43.387546Z",
36
+ "start_time": "2024-11-17T01:38:41.072764Z"
37
+ }
38
+ },
39
+ "cell_type": "code",
40
+ "source": [
41
+ "import os\n",
42
+ "import torch\n",
43
+ "from tqdm import tqdm\n",
44
+ "import plotly.express as px"
45
+ ],
46
+ "id": "32b364abf1f61fe4",
47
+ "outputs": [],
48
+ "execution_count": 1
49
+ },
50
+ {
51
+ "metadata": {
52
+ "ExecuteTime": {
53
+ "end_time": "2024-11-17T01:38:58.504157Z",
54
+ "start_time": "2024-11-17T01:38:49.162496Z"
55
+ }
56
+ },
57
+ "cell_type": "code",
58
+ "source": [
59
+ "from datasets import load_dataset\n",
60
+ "from transformer_lens import HookedTransformer\n",
61
+ "from sae_lens import SAE\n",
62
+ "device = \"cuda\"\n",
63
+ "\n",
64
+ "model = HookedTransformer.from_pretrained(\"gpt2-small\", device=device)\n",
65
+ "\n",
66
+ "# the cfg dict is returned alongside the SAE since it may contain useful information for analysing the SAE (eg: instantiating an activation store)\n",
67
+ "# Note that this is not the same as the SAE's config dict, rather it is whatever was in the HF repo, from which we can extract the SAE config dict\n",
68
+ "# We also return the feature sparsities which are stored in HF for convenience.\n",
69
+ "sae, cfg_dict, sparsity = SAE.from_pretrained(\n",
70
+ " release=\"gpt2-small-res-jb\", # see other options in sae_lens/pretrained_saes.yaml\n",
71
+ " sae_id=\"blocks.8.hook_resid_pre\", # won't always be a hook point\n",
72
+ " device=device,\n",
73
+ ")"
74
+ ],
75
+ "id": "e76a79976976d7ea",
76
+ "outputs": [
77
+ {
78
+ "name": "stdout",
79
+ "output_type": "stream",
80
+ "text": [
81
+ "Loaded pretrained model gpt2-small into HookedTransformer\n"
82
+ ]
83
+ },
84
+ {
85
+ "name": "stderr",
86
+ "output_type": "stream",
87
+ "text": [
88
+ "C:\\Users\\henry\\anaconda3\\envs\\dialignment\\lib\\site-packages\\sae_lens\\sae.py:145: UserWarning: \n",
89
+ "This SAE has non-empty model_from_pretrained_kwargs. \n",
90
+ "For optimal performance, load the model like so:\n",
91
+ "model = HookedSAETransformer.from_pretrained_no_processing(..., **cfg.model_from_pretrained_kwargs)\n",
92
+ " warnings.warn(\n"
93
+ ]
94
+ }
95
+ ],
96
+ "execution_count": 2
97
+ },
98
+ {
99
+ "metadata": {
100
+ "ExecuteTime": {
101
+ "end_time": "2024-11-17T01:38:59.683875Z",
102
+ "start_time": "2024-11-17T01:38:58.587175Z"
103
+ }
104
+ },
105
+ "cell_type": "code",
106
+ "source": [
107
+ "from transformer_lens.utils import tokenize_and_concatenate\n",
108
+ "\n",
109
+ "dataset = load_dataset(\n",
110
+ " path=\"NeelNanda/pile-10k\",\n",
111
+ " split=\"train\",\n",
112
+ " streaming=False,\n",
113
+ ")\n",
114
+ "\n",
115
+ "token_dataset = tokenize_and_concatenate(\n",
116
+ " dataset=dataset, # type: ignore\n",
117
+ " tokenizer=model.tokenizer, # type: ignore\n",
118
+ " streaming=True,\n",
119
+ " max_length=sae.cfg.context_size,\n",
120
+ " add_bos_token=sae.cfg.prepend_bos,\n",
121
+ ")"
122
+ ],
123
+ "id": "f1a688694a1c7e16",
124
+ "outputs": [],
125
+ "execution_count": 3
126
+ },
127
+ {
128
+ "metadata": {
129
+ "ExecuteTime": {
130
+ "end_time": "2024-11-17T01:39:25.256507Z",
131
+ "start_time": "2024-11-17T01:39:24.997695Z"
132
+ }
133
+ },
134
+ "cell_type": "code",
135
+ "source": [
136
+ "sae.eval() # prevents error if we're expecting a dead neuron mask for who grads\n",
137
+ "print(\"?\")\n",
138
+ "with torch.no_grad():\n",
139
+ " # activation store can give us tokens.\n",
140
+ " batch_tokens = token_dataset[:32][\"tokens\"]\n",
141
+ " _, cache = model.run_with_cache(batch_tokens, prepend_bos=True)\n",
142
+ "\n",
143
+ " # Use the SAE\n",
144
+ " feature_acts = sae.encode(cache[sae.cfg.hook_name])\n",
145
+ " sae_out = sae.decode(feature_acts)\n",
146
+ "\n",
147
+ " # save some room\n",
148
+ " del cache\n",
149
+ "\n",
150
+ " # ignore the bos token, get the number of features that activated in each token, averaged across batch and position\n",
151
+ " l0 = (feature_acts[:, 1:] > 0).float().sum(-1).detach()\n",
152
+ " print(\"average l0\", l0.mean().item())\n",
153
+ " px.histogram(l0.flatten().cpu().numpy()).show()"
154
+ ],
155
+ "id": "a1f9a9f823253259",
156
+ "outputs": [
157
+ {
158
+ "name": "stdout",
159
+ "output_type": "stream",
160
+ "text": [
161
+ "?\n",
162
+ "average l0 64.1279525756836\n"
163
+ ]
164
+ },
165
+ {
166
+ "data": {
167
+ "application/vnd.plotly.v1+json": {
168
+ "data": [
169
+ {
170
+ "alignmentgroup": "True",
171
+ "bingroup": "x",
172
+ "hovertemplate": "variable=0<br>value=%{x}<br>count=%{y}<extra></extra>",
173
+ "legendgroup": "0",
174
+ "marker": {
175
+ "color": "#636efa",
176
+ "pattern": {
177
+ "shape": ""
178
+ }
179
+ },
180
+ "name": "0",
181
+ "offsetgroup": "0",
182
+ "orientation": "v",
183
+ "showlegend": true,
184
+ "x": [
185
+ 15.0,
186
+ 25.0,
187
+ 51.0,
188
+ 84.0,
189
+ 88.0,
190
+ 100.0,
191
+ 85.0,
192
+ 60.0,
193
+ 49.0,
194
+ 47.0,
195
+ 36.0,
196
+ 37.0,
197
+ 72.0,
198
+ 88.0,
199
+ 81.0,
200
+ 74.0,
201
+ 50.0,
202
+ 66.0,
203
+ 59.0,
204
+ 38.0,
205
+ 57.0,
206
+ 67.0,
207
+ 38.0,
208
+ 68.0,
209
+ 54.0,
210
+ 68.0,
211
+ 62.0,
212
+ 74.0,
213
+ 66.0,
214
+ 53.0,
215
+ 85.0,
216
+ 76.0,
217
+ 92.0,
218
+ 59.0,
219
+ 73.0,
220
+ 52.0,
221
+ 46.0,
222
+ 51.0,
223
+ 42.0,
224
+ 81.0,
225
+ 49.0,
226
+ 42.0,
227
+ 77.0,
228
+ 90.0,
229
+ 60.0,
230
+ 93.0,
231
+ 70.0,
232
+ 77.0,
233
+ 70.0,
234
+ 59.0,
235
+ 74.0,
236
+ 66.0,
237
+ 71.0,
238
+ 51.0,
239
+ 43.0,
240
+ 44.0,
241
+ 39.0,
242
+ 22.0,
243
+ 30.0,
244
+ 44.0,
245
+ 44.0,
246
+ 34.0,
247
+ 59.0,
248
+ 36.0,
249
+ 52.0,
250
+ 60.0,
251
+ 57.0,
252
+ 45.0,
253
+ 62.0,
254
+ 55.0,
255
+ 75.0,
256
+ 43.0,
257
+ 22.0,
258
+ 37.0,
259
+ 41.0,
260
+ 40.0,
261
+ 60.0,
262
+ 50.0,
263
+ 57.0,
264
+ 74.0,
265
+ 53.0,
266
+ 84.0,
267
+ 120.0,
268
+ 78.0,
269
+ 76.0,
270
+ 67.0,
271
+ 72.0,
272
+ 68.0,
273
+ 101.0,
274
+ 78.0,
275
+ 87.0,
276
+ 99.0,
277
+ 85.0,
278
+ 47.0,
279
+ 48.0,
280
+ 30.0,
281
+ 76.0,
282
+ 65.0,
283
+ 63.0,
284
+ 49.0,
285
+ 45.0,
286
+ 70.0,
287
+ 79.0,
288
+ 76.0,
289
+ 74.0,
290
+ 71.0,
291
+ 66.0,
292
+ 69.0,
293
+ 97.0,
294
+ 81.0,
295
+ 65.0,
296
+ 69.0,
297
+ 83.0,
298
+ 84.0,
299
+ 65.0,
300
+ 58.0,
301
+ 77.0,
302
+ 63.0,
303
+ 66.0,
304
+ 64.0,
305
+ 64.0,
306
+ 72.0,
307
+ 66.0,
308
+ 90.0,
309
+ 75.0,
310
+ 59.0,
311
+ 75.0,
312
+ 27.0,
313
+ 47.0,
314
+ 45.0,
315
+ 55.0,
316
+ 54.0,
317
+ 76.0,
318
+ 66.0,
319
+ 90.0,
320
+ 98.0,
321
+ 66.0,
322
+ 77.0,
323
+ 71.0,
324
+ 79.0,
325
+ 80.0,
326
+ 60.0,
327
+ 63.0,
328
+ 91.0,
329
+ 82.0,
330
+ 65.0,
331
+ 59.0,
332
+ 70.0,
333
+ 63.0,
334
+ 73.0,
335
+ 72.0,
336
+ 63.0,
337
+ 87.0,
338
+ 81.0,
339
+ 78.0,
340
+ 86.0,
341
+ 61.0,
342
+ 59.0,
343
+ 98.0,
344
+ 84.0,
345
+ 65.0,
346
+ 63.0,
347
+ 51.0,
348
+ 63.0,
349
+ 61.0,
350
+ 72.0,
351
+ 78.0,
352
+ 85.0,
353
+ 79.0,
354
+ 75.0,
355
+ 86.0,
356
+ 51.0,
357
+ 37.0,
358
+ 48.0,
359
+ 51.0,
360
+ 49.0,
361
+ 54.0,
362
+ 58.0,
363
+ 67.0,
364
+ 41.0,
365
+ 49.0,
366
+ 68.0,
367
+ 68.0,
368
+ 88.0,
369
+ 40.0,
370
+ 42.0,
371
+ 49.0,
372
+ 90.0,
373
+ 49.0,
374
+ 65.0,
375
+ 87.0,
376
+ 77.0,
377
+ 39.0,
378
+ 75.0,
379
+ 54.0,
380
+ 70.0,
381
+ 57.0,
382
+ 43.0,
383
+ 96.0,
384
+ 51.0,
385
+ 45.0,
386
+ 61.0,
387
+ 63.0,
388
+ 61.0,
389
+ 90.0,
390
+ 52.0,
391
+ 89.0,
392
+ 60.0,
393
+ 77.0,
394
+ 62.0,
395
+ 71.0,
396
+ 62.0,
397
+ 74.0,
398
+ 105.0,
399
+ 89.0,
400
+ 118.0,
401
+ 71.0,
402
+ 67.0,
403
+ 45.0,
404
+ 53.0,
405
+ 58.0,
406
+ 82.0,
407
+ 76.0,
408
+ 45.0,
409
+ 53.0,
410
+ 43.0,
411
+ 71.0,
412
+ 86.0,
413
+ 71.0,
414
+ 51.0,
415
+ 48.0,
416
+ 51.0,
417
+ 84.0,
418
+ 79.0,
419
+ 87.0,
420
+ 78.0,
421
+ 68.0,
422
+ 94.0,
423
+ 74.0,
424
+ 64.0,
425
+ 68.0,
426
+ 38.0,
427
+ 53.0,
428
+ 57.0,
429
+ 57.0,
430
+ 78.0,
431
+ 68.0,
432
+ 39.0,
433
+ 44.0,
434
+ 49.0,
435
+ 57.0,
436
+ 65.0,
437
+ 62.0,
438
+ 60.0,
439
+ 30.0,
440
+ 49.0,
441
+ 59.0,
442
+ 66.0,
443
+ 71.0,
444
+ 55.0,
445
+ 66.0,
446
+ 66.0,
447
+ 63.0,
448
+ 52.0,
449
+ 84.0,
450
+ 76.0,
451
+ 90.0,
452
+ 73.0,
453
+ 71.0,
454
+ 85.0,
455
+ 77.0,
456
+ 82.0,
457
+ 72.0,
458
+ 68.0,
459
+ 58.0,
460
+ 46.0,
461
+ 49.0,
462
+ 57.0,
463
+ 75.0,
464
+ 46.0,
465
+ 64.0,
466
+ 53.0,
467
+ 55.0,
468
+ 67.0,
469
+ 79.0,
470
+ 88.0,
471
+ 72.0,
472
+ 58.0,
473
+ 28.0,
474
+ 39.0,
475
+ 44.0,
476
+ 47.0,
477
+ 92.0,
478
+ 98.0,
479
+ 72.0,
480
+ 83.0,
481
+ 25.0,
482
+ 37.0,
483
+ 82.0,
484
+ 75.0,
485
+ 55.0,
486
+ 69.0,
487
+ 80.0,
488
+ 82.0,
489
+ 71.0,
490
+ 64.0,
491
+ 50.0,
492
+ 96.0,
493
+ 71.0,
494
+ 71.0,
495
+ 74.0,
496
+ 75.0,
497
+ 82.0,
498
+ 86.0,
499
+ 79.0,
500
+ 85.0,
501
+ 83.0,
502
+ 72.0,
503
+ 68.0,
504
+ 55.0,
505
+ 40.0,
506
+ 49.0,
507
+ 76.0,
508
+ 82.0,
509
+ 83.0,
510
+ 78.0,
511
+ 70.0,
512
+ 108.0,
513
+ 81.0,
514
+ 54.0,
515
+ 22.0,
516
+ 40.0,
517
+ 41.0,
518
+ 59.0,
519
+ 42.0,
520
+ 48.0,
521
+ 68.0,
522
+ 70.0,
523
+ 95.0,
524
+ 120.0,
525
+ 75.0,
526
+ 52.0,
527
+ 32.0,
528
+ 33.0,
529
+ 21.0,
530
+ 69.0,
531
+ 57.0,
532
+ 52.0,
533
+ 55.0,
534
+ 48.0,
535
+ 47.0,
536
+ 91.0,
537
+ 60.0,
538
+ 68.0,
539
+ 54.0,
540
+ 62.0,
541
+ 65.0,
542
+ 75.0,
543
+ 74.0,
544
+ 73.0,
545
+ 71.0,
546
+ 87.0,
547
+ 61.0,
548
+ 57.0,
549
+ 75.0,
550
+ 83.0,
551
+ 73.0,
552
+ 104.0,
553
+ 86.0,
554
+ 112.0,
555
+ 82.0,
556
+ 74.0,
557
+ 72.0,
558
+ 53.0,
559
+ 54.0,
560
+ 27.0,
561
+ 35.0,
562
+ 61.0,
563
+ 65.0,
564
+ 70.0,
565
+ 70.0,
566
+ 6.0,
567
+ 26.0,
568
+ 21.0,
569
+ 42.0,
570
+ 71.0,
571
+ 87.0,
572
+ 32.0,
573
+ 45.0,
574
+ 88.0,
575
+ 65.0,
576
+ 74.0,
577
+ 62.0,
578
+ 68.0,
579
+ 65.0,
580
+ 55.0,
581
+ 40.0,
582
+ 38.0,
583
+ 28.0,
584
+ 34.0,
585
+ 34.0,
586
+ 42.0,
587
+ 47.0,
588
+ 78.0,
589
+ 47.0,
590
+ 72.0,
591
+ 78.0,
592
+ 61.0,
593
+ 79.0,
594
+ 106.0,
595
+ 75.0,
596
+ 95.0,
597
+ 68.0,
598
+ 70.0,
599
+ 49.0,
600
+ 54.0,
601
+ 69.0,
602
+ 73.0,
603
+ 85.0,
604
+ 69.0,
605
+ 71.0,
606
+ 56.0,
607
+ 64.0,
608
+ 77.0,
609
+ 84.0,
610
+ 79.0,
611
+ 90.0,
612
+ 86.0,
613
+ 79.0,
614
+ 34.0,
615
+ 27.0,
616
+ 29.0,
617
+ 37.0,
618
+ 46.0,
619
+ 55.0,
620
+ 53.0,
621
+ 48.0,
622
+ 48.0,
623
+ 58.0,
624
+ 58.0,
625
+ 52.0,
626
+ 61.0,
627
+ 58.0,
628
+ 42.0,
629
+ 75.0,
630
+ 83.0,
631
+ 60.0,
632
+ 63.0,
633
+ 39.0,
634
+ 33.0,
635
+ 52.0,
636
+ 46.0,
637
+ 55.0,
638
+ 29.0,
639
+ 34.0,
640
+ 51.0,
641
+ 54.0,
642
+ 64.0,
643
+ 90.0,
644
+ 63.0,
645
+ 59.0,
646
+ 91.0,
647
+ 62.0,
648
+ 77.0,
649
+ 87.0,
650
+ 74.0,
651
+ 39.0,
652
+ 44.0,
653
+ 32.0,
654
+ 84.0,
655
+ 53.0,
656
+ 32.0,
657
+ 41.0,
658
+ 46.0,
659
+ 45.0,
660
+ 48.0,
661
+ 68.0,
662
+ 78.0,
663
+ 41.0,
664
+ 45.0,
665
+ 54.0,
666
+ 72.0,
667
+ 61.0,
668
+ 70.0,
669
+ 62.0,
670
+ 54.0,
671
+ 71.0,
672
+ 80.0,
673
+ 92.0,
674
+ 89.0,
675
+ 73.0,
676
+ 99.0,
677
+ 85.0,
678
+ 83.0,
679
+ 92.0,
680
+ 79.0,
681
+ 67.0,
682
+ 68.0,
683
+ 78.0,
684
+ 90.0,
685
+ 72.0,
686
+ 80.0,
687
+ 95.0,
688
+ 78.0,
689
+ 75.0,
690
+ 48.0,
691
+ 47.0,
692
+ 61.0
693
+ ],
694
+ "xaxis": "x",
695
+ "yaxis": "y",
696
+ "type": "histogram"
697
+ }
698
+ ],
699
+ "layout": {
700
+ "template": {
701
+ "data": {
702
+ "histogram2dcontour": [
703
+ {
704
+ "type": "histogram2dcontour",
705
+ "colorbar": {
706
+ "outlinewidth": 0,
707
+ "ticks": ""
708
+ },
709
+ "colorscale": [
710
+ [
711
+ 0.0,
712
+ "#0d0887"
713
+ ],
714
+ [
715
+ 0.1111111111111111,
716
+ "#46039f"
717
+ ],
718
+ [
719
+ 0.2222222222222222,
720
+ "#7201a8"
721
+ ],
722
+ [
723
+ 0.3333333333333333,
724
+ "#9c179e"
725
+ ],
726
+ [
727
+ 0.4444444444444444,
728
+ "#bd3786"
729
+ ],
730
+ [
731
+ 0.5555555555555556,
732
+ "#d8576b"
733
+ ],
734
+ [
735
+ 0.6666666666666666,
736
+ "#ed7953"
737
+ ],
738
+ [
739
+ 0.7777777777777778,
740
+ "#fb9f3a"
741
+ ],
742
+ [
743
+ 0.8888888888888888,
744
+ "#fdca26"
745
+ ],
746
+ [
747
+ 1.0,
748
+ "#f0f921"
749
+ ]
750
+ ]
751
+ }
752
+ ],
753
+ "choropleth": [
754
+ {
755
+ "type": "choropleth",
756
+ "colorbar": {
757
+ "outlinewidth": 0,
758
+ "ticks": ""
759
+ }
760
+ }
761
+ ],
762
+ "histogram2d": [
763
+ {
764
+ "type": "histogram2d",
765
+ "colorbar": {
766
+ "outlinewidth": 0,
767
+ "ticks": ""
768
+ },
769
+ "colorscale": [
770
+ [
771
+ 0.0,
772
+ "#0d0887"
773
+ ],
774
+ [
775
+ 0.1111111111111111,
776
+ "#46039f"
777
+ ],
778
+ [
779
+ 0.2222222222222222,
780
+ "#7201a8"
781
+ ],
782
+ [
783
+ 0.3333333333333333,
784
+ "#9c179e"
785
+ ],
786
+ [
787
+ 0.4444444444444444,
788
+ "#bd3786"
789
+ ],
790
+ [
791
+ 0.5555555555555556,
792
+ "#d8576b"
793
+ ],
794
+ [
795
+ 0.6666666666666666,
796
+ "#ed7953"
797
+ ],
798
+ [
799
+ 0.7777777777777778,
800
+ "#fb9f3a"
801
+ ],
802
+ [
803
+ 0.8888888888888888,
804
+ "#fdca26"
805
+ ],
806
+ [
807
+ 1.0,
808
+ "#f0f921"
809
+ ]
810
+ ]
811
+ }
812
+ ],
813
+ "heatmap": [
814
+ {
815
+ "type": "heatmap",
816
+ "colorbar": {
817
+ "outlinewidth": 0,
818
+ "ticks": ""
819
+ },
820
+ "colorscale": [
821
+ [
822
+ 0.0,
823
+ "#0d0887"
824
+ ],
825
+ [
826
+ 0.1111111111111111,
827
+ "#46039f"
828
+ ],
829
+ [
830
+ 0.2222222222222222,
831
+ "#7201a8"
832
+ ],
833
+ [
834
+ 0.3333333333333333,
835
+ "#9c179e"
836
+ ],
837
+ [
838
+ 0.4444444444444444,
839
+ "#bd3786"
840
+ ],
841
+ [
842
+ 0.5555555555555556,
843
+ "#d8576b"
844
+ ],
845
+ [
846
+ 0.6666666666666666,
847
+ "#ed7953"
848
+ ],
849
+ [
850
+ 0.7777777777777778,
851
+ "#fb9f3a"
852
+ ],
853
+ [
854
+ 0.8888888888888888,
855
+ "#fdca26"
856
+ ],
857
+ [
858
+ 1.0,
859
+ "#f0f921"
860
+ ]
861
+ ]
862
+ }
863
+ ],
864
+ "heatmapgl": [
865
+ {
866
+ "type": "heatmapgl",
867
+ "colorbar": {
868
+ "outlinewidth": 0,
869
+ "ticks": ""
870
+ },
871
+ "colorscale": [
872
+ [
873
+ 0.0,
874
+ "#0d0887"
875
+ ],
876
+ [
877
+ 0.1111111111111111,
878
+ "#46039f"
879
+ ],
880
+ [
881
+ 0.2222222222222222,
882
+ "#7201a8"
883
+ ],
884
+ [
885
+ 0.3333333333333333,
886
+ "#9c179e"
887
+ ],
888
+ [
889
+ 0.4444444444444444,
890
+ "#bd3786"
891
+ ],
892
+ [
893
+ 0.5555555555555556,
894
+ "#d8576b"
895
+ ],
896
+ [
897
+ 0.6666666666666666,
898
+ "#ed7953"
899
+ ],
900
+ [
901
+ 0.7777777777777778,
902
+ "#fb9f3a"
903
+ ],
904
+ [
905
+ 0.8888888888888888,
906
+ "#fdca26"
907
+ ],
908
+ [
909
+ 1.0,
910
+ "#f0f921"
911
+ ]
912
+ ]
913
+ }
914
+ ],
915
+ "contourcarpet": [
916
+ {
917
+ "type": "contourcarpet",
918
+ "colorbar": {
919
+ "outlinewidth": 0,
920
+ "ticks": ""
921
+ }
922
+ }
923
+ ],
924
+ "contour": [
925
+ {
926
+ "type": "contour",
927
+ "colorbar": {
928
+ "outlinewidth": 0,
929
+ "ticks": ""
930
+ },
931
+ "colorscale": [
932
+ [
933
+ 0.0,
934
+ "#0d0887"
935
+ ],
936
+ [
937
+ 0.1111111111111111,
938
+ "#46039f"
939
+ ],
940
+ [
941
+ 0.2222222222222222,
942
+ "#7201a8"
943
+ ],
944
+ [
945
+ 0.3333333333333333,
946
+ "#9c179e"
947
+ ],
948
+ [
949
+ 0.4444444444444444,
950
+ "#bd3786"
951
+ ],
952
+ [
953
+ 0.5555555555555556,
954
+ "#d8576b"
955
+ ],
956
+ [
957
+ 0.6666666666666666,
958
+ "#ed7953"
959
+ ],
960
+ [
961
+ 0.7777777777777778,
962
+ "#fb9f3a"
963
+ ],
964
+ [
965
+ 0.8888888888888888,
966
+ "#fdca26"
967
+ ],
968
+ [
969
+ 1.0,
970
+ "#f0f921"
971
+ ]
972
+ ]
973
+ }
974
+ ],
975
+ "surface": [
976
+ {
977
+ "type": "surface",
978
+ "colorbar": {
979
+ "outlinewidth": 0,
980
+ "ticks": ""
981
+ },
982
+ "colorscale": [
983
+ [
984
+ 0.0,
985
+ "#0d0887"
986
+ ],
987
+ [
988
+ 0.1111111111111111,
989
+ "#46039f"
990
+ ],
991
+ [
992
+ 0.2222222222222222,
993
+ "#7201a8"
994
+ ],
995
+ [
996
+ 0.3333333333333333,
997
+ "#9c179e"
998
+ ],
999
+ [
1000
+ 0.4444444444444444,
1001
+ "#bd3786"
1002
+ ],
1003
+ [
1004
+ 0.5555555555555556,
1005
+ "#d8576b"
1006
+ ],
1007
+ [
1008
+ 0.6666666666666666,
1009
+ "#ed7953"
1010
+ ],
1011
+ [
1012
+ 0.7777777777777778,
1013
+ "#fb9f3a"
1014
+ ],
1015
+ [
1016
+ 0.8888888888888888,
1017
+ "#fdca26"
1018
+ ],
1019
+ [
1020
+ 1.0,
1021
+ "#f0f921"
1022
+ ]
1023
+ ]
1024
+ }
1025
+ ],
1026
+ "mesh3d": [
1027
+ {
1028
+ "type": "mesh3d",
1029
+ "colorbar": {
1030
+ "outlinewidth": 0,
1031
+ "ticks": ""
1032
+ }
1033
+ }
1034
+ ],
1035
+ "scatter": [
1036
+ {
1037
+ "marker": {
1038
+ "line": {
1039
+ "color": "#283442"
1040
+ }
1041
+ },
1042
+ "type": "scatter"
1043
+ }
1044
+ ],
1045
+ "parcoords": [
1046
+ {
1047
+ "type": "parcoords",
1048
+ "line": {
1049
+ "colorbar": {
1050
+ "outlinewidth": 0,
1051
+ "ticks": ""
1052
+ }
1053
+ }
1054
+ }
1055
+ ],
1056
+ "scatterpolargl": [
1057
+ {
1058
+ "type": "scatterpolargl",
1059
+ "marker": {
1060
+ "colorbar": {
1061
+ "outlinewidth": 0,
1062
+ "ticks": ""
1063
+ }
1064
+ }
1065
+ }
1066
+ ],
1067
+ "bar": [
1068
+ {
1069
+ "error_x": {
1070
+ "color": "#f2f5fa"
1071
+ },
1072
+ "error_y": {
1073
+ "color": "#f2f5fa"
1074
+ },
1075
+ "marker": {
1076
+ "line": {
1077
+ "color": "rgb(17,17,17)",
1078
+ "width": 0.5
1079
+ },
1080
+ "pattern": {
1081
+ "fillmode": "overlay",
1082
+ "size": 10,
1083
+ "solidity": 0.2
1084
+ }
1085
+ },
1086
+ "type": "bar"
1087
+ }
1088
+ ],
1089
+ "scattergeo": [
1090
+ {
1091
+ "type": "scattergeo",
1092
+ "marker": {
1093
+ "colorbar": {
1094
+ "outlinewidth": 0,
1095
+ "ticks": ""
1096
+ }
1097
+ }
1098
+ }
1099
+ ],
1100
+ "scatterpolar": [
1101
+ {
1102
+ "type": "scatterpolar",
1103
+ "marker": {
1104
+ "colorbar": {
1105
+ "outlinewidth": 0,
1106
+ "ticks": ""
1107
+ }
1108
+ }
1109
+ }
1110
+ ],
1111
+ "histogram": [
1112
+ {
1113
+ "marker": {
1114
+ "pattern": {
1115
+ "fillmode": "overlay",
1116
+ "size": 10,
1117
+ "solidity": 0.2
1118
+ }
1119
+ },
1120
+ "type": "histogram"
1121
+ }
1122
+ ],
1123
+ "scattergl": [
1124
+ {
1125
+ "marker": {
1126
+ "line": {
1127
+ "color": "#283442"
1128
+ }
1129
+ },
1130
+ "type": "scattergl"
1131
+ }
1132
+ ],
1133
+ "scatter3d": [
1134
+ {
1135
+ "type": "scatter3d",
1136
+ "line": {
1137
+ "colorbar": {
1138
+ "outlinewidth": 0,
1139
+ "ticks": ""
1140
+ }
1141
+ },
1142
+ "marker": {
1143
+ "colorbar": {
1144
+ "outlinewidth": 0,
1145
+ "ticks": ""
1146
+ }
1147
+ }
1148
+ }
1149
+ ],
1150
+ "scattermapbox": [
1151
+ {
1152
+ "type": "scattermapbox",
1153
+ "marker": {
1154
+ "colorbar": {
1155
+ "outlinewidth": 0,
1156
+ "ticks": ""
1157
+ }
1158
+ }
1159
+ }
1160
+ ],
1161
+ "scatterternary": [
1162
+ {
1163
+ "type": "scatterternary",
1164
+ "marker": {
1165
+ "colorbar": {
1166
+ "outlinewidth": 0,
1167
+ "ticks": ""
1168
+ }
1169
+ }
1170
+ }
1171
+ ],
1172
+ "scattercarpet": [
1173
+ {
1174
+ "type": "scattercarpet",
1175
+ "marker": {
1176
+ "colorbar": {
1177
+ "outlinewidth": 0,
1178
+ "ticks": ""
1179
+ }
1180
+ }
1181
+ }
1182
+ ],
1183
+ "carpet": [
1184
+ {
1185
+ "aaxis": {
1186
+ "endlinecolor": "#A2B1C6",
1187
+ "gridcolor": "#506784",
1188
+ "linecolor": "#506784",
1189
+ "minorgridcolor": "#506784",
1190
+ "startlinecolor": "#A2B1C6"
1191
+ },
1192
+ "baxis": {
1193
+ "endlinecolor": "#A2B1C6",
1194
+ "gridcolor": "#506784",
1195
+ "linecolor": "#506784",
1196
+ "minorgridcolor": "#506784",
1197
+ "startlinecolor": "#A2B1C6"
1198
+ },
1199
+ "type": "carpet"
1200
+ }
1201
+ ],
1202
+ "table": [
1203
+ {
1204
+ "cells": {
1205
+ "fill": {
1206
+ "color": "#506784"
1207
+ },
1208
+ "line": {
1209
+ "color": "rgb(17,17,17)"
1210
+ }
1211
+ },
1212
+ "header": {
1213
+ "fill": {
1214
+ "color": "#2a3f5f"
1215
+ },
1216
+ "line": {
1217
+ "color": "rgb(17,17,17)"
1218
+ }
1219
+ },
1220
+ "type": "table"
1221
+ }
1222
+ ],
1223
+ "barpolar": [
1224
+ {
1225
+ "marker": {
1226
+ "line": {
1227
+ "color": "rgb(17,17,17)",
1228
+ "width": 0.5
1229
+ },
1230
+ "pattern": {
1231
+ "fillmode": "overlay",
1232
+ "size": 10,
1233
+ "solidity": 0.2
1234
+ }
1235
+ },
1236
+ "type": "barpolar"
1237
+ }
1238
+ ],
1239
+ "pie": [
1240
+ {
1241
+ "automargin": true,
1242
+ "type": "pie"
1243
+ }
1244
+ ]
1245
+ },
1246
+ "layout": {
1247
+ "autotypenumbers": "strict",
1248
+ "colorway": [
1249
+ "#636efa",
1250
+ "#EF553B",
1251
+ "#00cc96",
1252
+ "#ab63fa",
1253
+ "#FFA15A",
1254
+ "#19d3f3",
1255
+ "#FF6692",
1256
+ "#B6E880",
1257
+ "#FF97FF",
1258
+ "#FECB52"
1259
+ ],
1260
+ "font": {
1261
+ "color": "#f2f5fa"
1262
+ },
1263
+ "hovermode": "closest",
1264
+ "hoverlabel": {
1265
+ "align": "left"
1266
+ },
1267
+ "paper_bgcolor": "rgb(17,17,17)",
1268
+ "plot_bgcolor": "rgb(17,17,17)",
1269
+ "polar": {
1270
+ "bgcolor": "rgb(17,17,17)",
1271
+ "angularaxis": {
1272
+ "gridcolor": "#506784",
1273
+ "linecolor": "#506784",
1274
+ "ticks": ""
1275
+ },
1276
+ "radialaxis": {
1277
+ "gridcolor": "#506784",
1278
+ "linecolor": "#506784",
1279
+ "ticks": ""
1280
+ }
1281
+ },
1282
+ "ternary": {
1283
+ "bgcolor": "rgb(17,17,17)",
1284
+ "aaxis": {
1285
+ "gridcolor": "#506784",
1286
+ "linecolor": "#506784",
1287
+ "ticks": ""
1288
+ },
1289
+ "baxis": {
1290
+ "gridcolor": "#506784",
1291
+ "linecolor": "#506784",
1292
+ "ticks": ""
1293
+ },
1294
+ "caxis": {
1295
+ "gridcolor": "#506784",
1296
+ "linecolor": "#506784",
1297
+ "ticks": ""
1298
+ }
1299
+ },
1300
+ "coloraxis": {
1301
+ "colorbar": {
1302
+ "outlinewidth": 0,
1303
+ "ticks": ""
1304
+ }
1305
+ },
1306
+ "colorscale": {
1307
+ "sequential": [
1308
+ [
1309
+ 0.0,
1310
+ "#0d0887"
1311
+ ],
1312
+ [
1313
+ 0.1111111111111111,
1314
+ "#46039f"
1315
+ ],
1316
+ [
1317
+ 0.2222222222222222,
1318
+ "#7201a8"
1319
+ ],
1320
+ [
1321
+ 0.3333333333333333,
1322
+ "#9c179e"
1323
+ ],
1324
+ [
1325
+ 0.4444444444444444,
1326
+ "#bd3786"
1327
+ ],
1328
+ [
1329
+ 0.5555555555555556,
1330
+ "#d8576b"
1331
+ ],
1332
+ [
1333
+ 0.6666666666666666,
1334
+ "#ed7953"
1335
+ ],
1336
+ [
1337
+ 0.7777777777777778,
1338
+ "#fb9f3a"
1339
+ ],
1340
+ [
1341
+ 0.8888888888888888,
1342
+ "#fdca26"
1343
+ ],
1344
+ [
1345
+ 1.0,
1346
+ "#f0f921"
1347
+ ]
1348
+ ],
1349
+ "sequentialminus": [
1350
+ [
1351
+ 0.0,
1352
+ "#0d0887"
1353
+ ],
1354
+ [
1355
+ 0.1111111111111111,
1356
+ "#46039f"
1357
+ ],
1358
+ [
1359
+ 0.2222222222222222,
1360
+ "#7201a8"
1361
+ ],
1362
+ [
1363
+ 0.3333333333333333,
1364
+ "#9c179e"
1365
+ ],
1366
+ [
1367
+ 0.4444444444444444,
1368
+ "#bd3786"
1369
+ ],
1370
+ [
1371
+ 0.5555555555555556,
1372
+ "#d8576b"
1373
+ ],
1374
+ [
1375
+ 0.6666666666666666,
1376
+ "#ed7953"
1377
+ ],
1378
+ [
1379
+ 0.7777777777777778,
1380
+ "#fb9f3a"
1381
+ ],
1382
+ [
1383
+ 0.8888888888888888,
1384
+ "#fdca26"
1385
+ ],
1386
+ [
1387
+ 1.0,
1388
+ "#f0f921"
1389
+ ]
1390
+ ],
1391
+ "diverging": [
1392
+ [
1393
+ 0,
1394
+ "#8e0152"
1395
+ ],
1396
+ [
1397
+ 0.1,
1398
+ "#c51b7d"
1399
+ ],
1400
+ [
1401
+ 0.2,
1402
+ "#de77ae"
1403
+ ],
1404
+ [
1405
+ 0.3,
1406
+ "#f1b6da"
1407
+ ],
1408
+ [
1409
+ 0.4,
1410
+ "#fde0ef"
1411
+ ],
1412
+ [
1413
+ 0.5,
1414
+ "#f7f7f7"
1415
+ ],
1416
+ [
1417
+ 0.6,
1418
+ "#e6f5d0"
1419
+ ],
1420
+ [
1421
+ 0.7,
1422
+ "#b8e186"
1423
+ ],
1424
+ [
1425
+ 0.8,
1426
+ "#7fbc41"
1427
+ ],
1428
+ [
1429
+ 0.9,
1430
+ "#4d9221"
1431
+ ],
1432
+ [
1433
+ 1,
1434
+ "#276419"
1435
+ ]
1436
+ ]
1437
+ },
1438
+ "xaxis": {
1439
+ "gridcolor": "#283442",
1440
+ "linecolor": "#506784",
1441
+ "ticks": "",
1442
+ "title": {
1443
+ "standoff": 15
1444
+ },
1445
+ "zerolinecolor": "#283442",
1446
+ "automargin": true,
1447
+ "zerolinewidth": 2
1448
+ },
1449
+ "yaxis": {
1450
+ "gridcolor": "#283442",
1451
+ "linecolor": "#506784",
1452
+ "ticks": "",
1453
+ "title": {
1454
+ "standoff": 15
1455
+ },
1456
+ "zerolinecolor": "#283442",
1457
+ "automargin": true,
1458
+ "zerolinewidth": 2
1459
+ },
1460
+ "scene": {
1461
+ "xaxis": {
1462
+ "backgroundcolor": "rgb(17,17,17)",
1463
+ "gridcolor": "#506784",
1464
+ "linecolor": "#506784",
1465
+ "showbackground": true,
1466
+ "ticks": "",
1467
+ "zerolinecolor": "#C8D4E3",
1468
+ "gridwidth": 2
1469
+ },
1470
+ "yaxis": {
1471
+ "backgroundcolor": "rgb(17,17,17)",
1472
+ "gridcolor": "#506784",
1473
+ "linecolor": "#506784",
1474
+ "showbackground": true,
1475
+ "ticks": "",
1476
+ "zerolinecolor": "#C8D4E3",
1477
+ "gridwidth": 2
1478
+ },
1479
+ "zaxis": {
1480
+ "backgroundcolor": "rgb(17,17,17)",
1481
+ "gridcolor": "#506784",
1482
+ "linecolor": "#506784",
1483
+ "showbackground": true,
1484
+ "ticks": "",
1485
+ "zerolinecolor": "#C8D4E3",
1486
+ "gridwidth": 2
1487
+ }
1488
+ },
1489
+ "shapedefaults": {
1490
+ "line": {
1491
+ "color": "#f2f5fa"
1492
+ }
1493
+ },
1494
+ "annotationdefaults": {
1495
+ "arrowcolor": "#f2f5fa",
1496
+ "arrowhead": 0,
1497
+ "arrowwidth": 1
1498
+ },
1499
+ "geo": {
1500
+ "bgcolor": "rgb(17,17,17)",
1501
+ "landcolor": "rgb(17,17,17)",
1502
+ "subunitcolor": "#506784",
1503
+ "showland": true,
1504
+ "showlakes": true,
1505
+ "lakecolor": "rgb(17,17,17)"
1506
+ },
1507
+ "title": {
1508
+ "x": 0.05
1509
+ },
1510
+ "updatemenudefaults": {
1511
+ "bgcolor": "#506784",
1512
+ "borderwidth": 0
1513
+ },
1514
+ "sliderdefaults": {
1515
+ "bgcolor": "#C8D4E3",
1516
+ "borderwidth": 1,
1517
+ "bordercolor": "rgb(17,17,17)",
1518
+ "tickwidth": 0
1519
+ },
1520
+ "mapbox": {
1521
+ "style": "dark"
1522
+ }
1523
+ }
1524
+ },
1525
+ "xaxis": {
1526
+ "anchor": "y",
1527
+ "domain": [
1528
+ 0.0,
1529
+ 1.0
1530
+ ],
1531
+ "title": {
1532
+ "text": "value"
1533
+ }
1534
+ },
1535
+ "yaxis": {
1536
+ "anchor": "x",
1537
+ "domain": [
1538
+ 0.0,
1539
+ 1.0
1540
+ ],
1541
+ "title": {
1542
+ "text": "count"
1543
+ }
1544
+ },
1545
+ "legend": {
1546
+ "title": {
1547
+ "text": "variable"
1548
+ },
1549
+ "tracegroupgap": 0
1550
+ },
1551
+ "margin": {
1552
+ "t": 60
1553
+ },
1554
+ "barmode": "relative"
1555
+ },
1556
+ "config": {
1557
+ "plotlyServerURL": "https://plot.ly"
1558
+ }
1559
+ },
1560
+ "text/html": [
1561
+ "<div> <div id=\"123c5b41-465a-466f-8bd4-1be7a1927f18\" class=\"plotly-graph-div\" style=\"height:525px; width:100%;\"></div> <script type=\"text/javascript\"> require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById(\"123c5b41-465a-466f-8bd4-1be7a1927f18\")) { Plotly.newPlot( \"123c5b41-465a-466f-8bd4-1be7a1927f18\", [{\"alignmentgroup\":\"True\",\"bingroup\":\"x\",\"hovertemplate\":\"variable=0\\u003cbr\\u003evalue=%{x}\\u003cbr\\u003ecount=%{y}\\u003cextra\\u003e\\u003c\\u002fextra\\u003e\",\"legendgroup\":\"0\",\"marker\":{\"color\":\"#636efa\",\"pattern\":{\"shape\":\"\"}},\"name\":\"0\",\"offsetgroup\":\"0\",\"orientation\":\"v\",\"showlegend\":true,\"x\":[15.0,25.0,51.0,84.0,88.0,100.0,85.0,60.0,49.0,47.0,36.0,37.0,72.0,88.0,81.0,74.0,50.0,66.0,59.0,38.0,57.0,67.0,38.0,68.0,54.0,68.0,62.0,74.0,66.0,53.0,85.0,76.0,92.0,59.0,73.0,52.0,46.0,51.0,42.0,81.0,49.0,42.0,77.0,90.0,60.0,93.0,70.0,77.0,70.0,59.0,74.0,66.0,71.0,51.0,43.0,44.0,39.0,22.0,30.0,44.0,44.0,34.0,59.0,36.0,52.0,60.0,57.0,45.0,62.0,55.0,75.0,43.0,22.0,37.0,41.0,40.0,60.0,50.0,57.0,74.0,53.0,84.0,120.0,78.0,76.0,67.0,72.0,68.0,101.0,78.0,87.0,99.0,85.0,47.0,48.0,30.0,76.0,65.0,63.0,49.0,45.0,70.0,79.0,76.0,74.0,71.0,66.0,69.0,97.0,81.0,65.0,69.0,83.0,84.0,65.0,58.0,77.0,63.0,66.0,64.0,64.0,72.0,66.0,90.0,75.0,59.0,75.0,27.0,47.0,45.0,55.0,54.0,76.0,66.0,90.0,98.0,66.0,77.0,71.0,79.0,80.0,60.0,63.0,91.0,82.0,65.0,59.0,70.0,63.0,73.0,72.0,63.0,87.0,81.0,78.0,86.0,61.0,59.0,98.0,84.0,65.0,63.0,51.0,63.0,61.0,72.0,78.0,85.0,79.0,75.0,86.0,51.0,37.0,48.0,51.0,49.0,54.0,58.0,67.0,41.0,49.0,68.0,68.0,88.0,40.0,42.0,49.0,90.0,49.0,65.0,87.0,77.0,39.0,75.0,54.0,70.0,57.0,43.0,96.0,51.0,45.0,61.0,63.0,61.0,90.0,52.0,89.0,60.0,77.0,62.0,71.0,62.0,74.0,105.0,89.0,118.0,71.0,67.0,45.0,53.0,58.0,82.0,76.0,45.0,53.0,43.0,71.0,86.0,71.0,51.0,48.0,51.0,84.0,79.0,87.0,78.0,68.0,94.0,74.0,64.0,68.0,38.0,53.0,57.0,57.0,78.0,68.0,39.0,44.0,49.0,57.0,65.0,62.0,60.0
,30.0,49.0,59.0,66.0,71.0,55.0,66.0,66.0,63.0,52.0,84.0,76.0,90.0,73.0,71.0,85.0,77.0,82.0,72.0,68.0,58.0,46.0,49.0,57.0,75.0,46.0,64.0,53.0,55.0,67.0,79.0,88.0,72.0,58.0,28.0,39.0,44.0,47.0,92.0,98.0,72.0,83.0,25.0,37.0,82.0,75.0,55.0,69.0,80.0,82.0,71.0,64.0,50.0,96.0,71.0,71.0,74.0,75.0,82.0,86.0,79.0,85.0,83.0,72.0,68.0,55.0,40.0,49.0,76.0,82.0,83.0,78.0,70.0,108.0,81.0,54.0,22.0,40.0,41.0,59.0,42.0,48.0,68.0,70.0,95.0,120.0,75.0,52.0,32.0,33.0,21.0,69.0,57.0,52.0,55.0,48.0,47.0,91.0,60.0,68.0,54.0,62.0,65.0,75.0,74.0,73.0,71.0,87.0,61.0,57.0,75.0,83.0,73.0,104.0,86.0,112.0,82.0,74.0,72.0,53.0,54.0,27.0,35.0,61.0,65.0,70.0,70.0,6.0,26.0,21.0,42.0,71.0,87.0,32.0,45.0,88.0,65.0,74.0,62.0,68.0,65.0,55.0,40.0,38.0,28.0,34.0,34.0,42.0,47.0,78.0,47.0,72.0,78.0,61.0,79.0,106.0,75.0,95.0,68.0,70.0,49.0,54.0,69.0,73.0,85.0,69.0,71.0,56.0,64.0,77.0,84.0,79.0,90.0,86.0,79.0,34.0,27.0,29.0,37.0,46.0,55.0,53.0,48.0,48.0,58.0,58.0,52.0,61.0,58.0,42.0,75.0,83.0,60.0,63.0,39.0,33.0,52.0,46.0,55.0,29.0,34.0,51.0,54.0,64.0,90.0,63.0,59.0,91.0,62.0,77.0,87.0,74.0,39.0,44.0,32.0,84.0,53.0,32.0,41.0,46.0,45.0,48.0,68.0,78.0,41.0,45.0,54.0,72.0,61.0,70.0,62.0,54.0,71.0,80.0,92.0,89.0,73.0,99.0,85.0,83.0,92.0,79.0,67.0,68.0,78.0,90.0,72.0,80.0,95.0,78.0,75.0,48.0,47.0,61.0],\"xaxis\":\"x\",\"yaxis\":\"y\",\"type\":\"histogram\"}], 
{\"template\":{\"data\":{\"histogram2dcontour\":[{\"type\":\"histogram2dcontour\",\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"},\"colorscale\":[[0.0,\"#0d0887\"],[0.1111111111111111,\"#46039f\"],[0.2222222222222222,\"#7201a8\"],[0.3333333333333333,\"#9c179e\"],[0.4444444444444444,\"#bd3786\"],[0.5555555555555556,\"#d8576b\"],[0.6666666666666666,\"#ed7953\"],[0.7777777777777778,\"#fb9f3a\"],[0.8888888888888888,\"#fdca26\"],[1.0,\"#f0f921\"]]}],\"choropleth\":[{\"type\":\"choropleth\",\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}}],\"histogram2d\":[{\"type\":\"histogram2d\",\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"},\"colorscale\":[[0.0,\"#0d0887\"],[0.1111111111111111,\"#46039f\"],[0.2222222222222222,\"#7201a8\"],[0.3333333333333333,\"#9c179e\"],[0.4444444444444444,\"#bd3786\"],[0.5555555555555556,\"#d8576b\"],[0.6666666666666666,\"#ed7953\"],[0.7777777777777778,\"#fb9f3a\"],[0.8888888888888888,\"#fdca26\"],[1.0,\"#f0f921\"]]}],\"heatmap\":[{\"type\":\"heatmap\",\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"},\"colorscale\":[[0.0,\"#0d0887\"],[0.1111111111111111,\"#46039f\"],[0.2222222222222222,\"#7201a8\"],[0.3333333333333333,\"#9c179e\"],[0.4444444444444444,\"#bd3786\"],[0.5555555555555556,\"#d8576b\"],[0.6666666666666666,\"#ed7953\"],[0.7777777777777778,\"#fb9f3a\"],[0.8888888888888888,\"#fdca26\"],[1.0,\"#f0f921\"]]}],\"heatmapgl\":[{\"type\":\"heatmapgl\",\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"},\"colorscale\":[[0.0,\"#0d0887\"],[0.1111111111111111,\"#46039f\"],[0.2222222222222222,\"#7201a8\"],[0.3333333333333333,\"#9c179e\"],[0.4444444444444444,\"#bd3786\"],[0.5555555555555556,\"#d8576b\"],[0.6666666666666666,\"#ed7953\"],[0.7777777777777778,\"#fb9f3a\"],[0.8888888888888888,\"#fdca26\"],[1.0,\"#f0f921\"]]}],\"contourcarpet\":[{\"type\":\"contourcarpet\",\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}}],\"contour\":[{\"type\":\"contour\",\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"},\"colorscale\":[[0.0,\"#0d0887\"],[0.1111111111111111,\
"#46039f\"],[0.2222222222222222,\"#7201a8\"],[0.3333333333333333,\"#9c179e\"],[0.4444444444444444,\"#bd3786\"],[0.5555555555555556,\"#d8576b\"],[0.6666666666666666,\"#ed7953\"],[0.7777777777777778,\"#fb9f3a\"],[0.8888888888888888,\"#fdca26\"],[1.0,\"#f0f921\"]]}],\"surface\":[{\"type\":\"surface\",\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"},\"colorscale\":[[0.0,\"#0d0887\"],[0.1111111111111111,\"#46039f\"],[0.2222222222222222,\"#7201a8\"],[0.3333333333333333,\"#9c179e\"],[0.4444444444444444,\"#bd3786\"],[0.5555555555555556,\"#d8576b\"],[0.6666666666666666,\"#ed7953\"],[0.7777777777777778,\"#fb9f3a\"],[0.8888888888888888,\"#fdca26\"],[1.0,\"#f0f921\"]]}],\"mesh3d\":[{\"type\":\"mesh3d\",\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}}],\"scatter\":[{\"marker\":{\"line\":{\"color\":\"#283442\"}},\"type\":\"scatter\"}],\"parcoords\":[{\"type\":\"parcoords\",\"line\":{\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}}}],\"scatterpolargl\":[{\"type\":\"scatterpolargl\",\"marker\":{\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}}}],\"bar\":[{\"error_x\":{\"color\":\"#f2f5fa\"},\"error_y\":{\"color\":\"#f2f5fa\"},\"marker\":{\"line\":{\"color\":\"rgb(17,17,17)\",\"width\":0.5},\"pattern\":{\"fillmode\":\"overlay\",\"size\":10,\"solidity\":0.2}},\"type\":\"bar\"}],\"scattergeo\":[{\"type\":\"scattergeo\",\"marker\":{\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}}}],\"scatterpolar\":[{\"type\":\"scatterpolar\",\"marker\":{\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}}}],\"histogram\":[{\"marker\":{\"pattern\":{\"fillmode\":\"overlay\",\"size\":10,\"solidity\":0.2}},\"type\":\"histogram\"}],\"scattergl\":[{\"marker\":{\"line\":{\"color\":\"#283442\"}},\"type\":\"scattergl\"}],\"scatter3d\":[{\"type\":\"scatter3d\",\"line\":{\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}},\"marker\":{\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}}}],\"scattermapbox\":[{\"type\":\"scattermapbox\",\"marker\":{\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}}}],\"scatterternary\":[
{\"type\":\"scatterternary\",\"marker\":{\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}}}],\"scattercarpet\":[{\"type\":\"scattercarpet\",\"marker\":{\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}}}],\"carpet\":[{\"aaxis\":{\"endlinecolor\":\"#A2B1C6\",\"gridcolor\":\"#506784\",\"linecolor\":\"#506784\",\"minorgridcolor\":\"#506784\",\"startlinecolor\":\"#A2B1C6\"},\"baxis\":{\"endlinecolor\":\"#A2B1C6\",\"gridcolor\":\"#506784\",\"linecolor\":\"#506784\",\"minorgridcolor\":\"#506784\",\"startlinecolor\":\"#A2B1C6\"},\"type\":\"carpet\"}],\"table\":[{\"cells\":{\"fill\":{\"color\":\"#506784\"},\"line\":{\"color\":\"rgb(17,17,17)\"}},\"header\":{\"fill\":{\"color\":\"#2a3f5f\"},\"line\":{\"color\":\"rgb(17,17,17)\"}},\"type\":\"table\"}],\"barpolar\":[{\"marker\":{\"line\":{\"color\":\"rgb(17,17,17)\",\"width\":0.5},\"pattern\":{\"fillmode\":\"overlay\",\"size\":10,\"solidity\":0.2}},\"type\":\"barpolar\"}],\"pie\":[{\"automargin\":true,\"type\":\"pie\"}]},\"layout\":{\"autotypenumbers\":\"strict\",\"colorway\":[\"#636efa\",\"#EF553B\",\"#00cc96\",\"#ab63fa\",\"#FFA15A\",\"#19d3f3\",\"#FF6692\",\"#B6E880\",\"#FF97FF\",\"#FECB52\"],\"font\":{\"color\":\"#f2f5fa\"},\"hovermode\":\"closest\",\"hoverlabel\":{\"align\":\"left\"},\"paper_bgcolor\":\"rgb(17,17,17)\",\"plot_bgcolor\":\"rgb(17,17,17)\",\"polar\":{\"bgcolor\":\"rgb(17,17,17)\",\"angularaxis\":{\"gridcolor\":\"#506784\",\"linecolor\":\"#506784\",\"ticks\":\"\"},\"radialaxis\":{\"gridcolor\":\"#506784\",\"linecolor\":\"#506784\",\"ticks\":\"\"}},\"ternary\":{\"bgcolor\":\"rgb(17,17,17)\",\"aaxis\":{\"gridcolor\":\"#506784\",\"linecolor\":\"#506784\",\"ticks\":\"\"},\"baxis\":{\"gridcolor\":\"#506784\",\"linecolor\":\"#506784\",\"ticks\":\"\"},\"caxis\":{\"gridcolor\":\"#506784\",\"linecolor\":\"#506784\",\"ticks\":\"\"}},\"coloraxis\":{\"colorbar\":{\"outlinewidth\":0,\"ticks\":\"\"}},\"colorscale\":{\"sequential\":[[0.0,\"#0d0887\"],[0.1111111111111111,\"#46039f\"],[0.2222222222222222,\"#7201a8\"],[0.333333
3333333333,\"#9c179e\"],[0.4444444444444444,\"#bd3786\"],[0.5555555555555556,\"#d8576b\"],[0.6666666666666666,\"#ed7953\"],[0.7777777777777778,\"#fb9f3a\"],[0.8888888888888888,\"#fdca26\"],[1.0,\"#f0f921\"]],\"sequentialminus\":[[0.0,\"#0d0887\"],[0.1111111111111111,\"#46039f\"],[0.2222222222222222,\"#7201a8\"],[0.3333333333333333,\"#9c179e\"],[0.4444444444444444,\"#bd3786\"],[0.5555555555555556,\"#d8576b\"],[0.6666666666666666,\"#ed7953\"],[0.7777777777777778,\"#fb9f3a\"],[0.8888888888888888,\"#fdca26\"],[1.0,\"#f0f921\"]],\"diverging\":[[0,\"#8e0152\"],[0.1,\"#c51b7d\"],[0.2,\"#de77ae\"],[0.3,\"#f1b6da\"],[0.4,\"#fde0ef\"],[0.5,\"#f7f7f7\"],[0.6,\"#e6f5d0\"],[0.7,\"#b8e186\"],[0.8,\"#7fbc41\"],[0.9,\"#4d9221\"],[1,\"#276419\"]]},\"xaxis\":{\"gridcolor\":\"#283442\",\"linecolor\":\"#506784\",\"ticks\":\"\",\"title\":{\"standoff\":15},\"zerolinecolor\":\"#283442\",\"automargin\":true,\"zerolinewidth\":2},\"yaxis\":{\"gridcolor\":\"#283442\",\"linecolor\":\"#506784\",\"ticks\":\"\",\"title\":{\"standoff\":15},\"zerolinecolor\":\"#283442\",\"automargin\":true,\"zerolinewidth\":2},\"scene\":{\"xaxis\":{\"backgroundcolor\":\"rgb(17,17,17)\",\"gridcolor\":\"#506784\",\"linecolor\":\"#506784\",\"showbackground\":true,\"ticks\":\"\",\"zerolinecolor\":\"#C8D4E3\",\"gridwidth\":2},\"yaxis\":{\"backgroundcolor\":\"rgb(17,17,17)\",\"gridcolor\":\"#506784\",\"linecolor\":\"#506784\",\"showbackground\":true,\"ticks\":\"\",\"zerolinecolor\":\"#C8D4E3\",\"gridwidth\":2},\"zaxis\":{\"backgroundcolor\":\"rgb(17,17,17)\",\"gridcolor\":\"#506784\",\"linecolor\":\"#506784\",\"showbackground\":true,\"ticks\":\"\",\"zerolinecolor\":\"#C8D4E3\",\"gridwidth\":2}},\"shapedefaults\":{\"line\":{\"color\":\"#f2f5fa\"}},\"annotationdefaults\":{\"arrowcolor\":\"#f2f5fa\",\"arrowhead\":0,\"arrowwidth\":1},\"geo\":{\"bgcolor\":\"rgb(17,17,17)\",\"landcolor\":\"rgb(17,17,17)\",\"subunitcolor\":\"#506784\",\"showland\":true,\"showlakes\":true,\"lakecolor\":\"rgb(17,17,17)\"},\"title\":{\"x\":0.05},\
"updatemenudefaults\":{\"bgcolor\":\"#506784\",\"borderwidth\":0},\"sliderdefaults\":{\"bgcolor\":\"#C8D4E3\",\"borderwidth\":1,\"bordercolor\":\"rgb(17,17,17)\",\"tickwidth\":0},\"mapbox\":{\"style\":\"dark\"}}},\"xaxis\":{\"anchor\":\"y\",\"domain\":[0.0,1.0],\"title\":{\"text\":\"value\"}},\"yaxis\":{\"anchor\":\"x\",\"domain\":[0.0,1.0],\"title\":{\"text\":\"count\"}},\"legend\":{\"title\":{\"text\":\"variable\"},\"tracegroupgap\":0},\"margin\":{\"t\":60},\"barmode\":\"relative\"}, {\"responsive\": true} ).then(function(){\n",
1562
+ " \n",
1563
+ "var gd = document.getElementById('123c5b41-465a-466f-8bd4-1be7a1927f18');\n",
1564
+ "var x = new MutationObserver(function (mutations, observer) {{\n",
1565
+ " var display = window.getComputedStyle(gd).display;\n",
1566
+ " if (!display || display === 'none') {{\n",
1567
+ " console.log([gd, 'removed!']);\n",
1568
+ " Plotly.purge(gd);\n",
1569
+ " observer.disconnect();\n",
1570
+ " }}\n",
1571
+ "}});\n",
1572
+ "\n",
1573
+ "// Listen for the removal of the full notebook cells\n",
1574
+ "var notebookContainer = gd.closest('#notebook-container');\n",
1575
+ "if (notebookContainer) {{\n",
1576
+ " x.observe(notebookContainer, {childList: true});\n",
1577
+ "}}\n",
1578
+ "\n",
1579
+ "// Listen for the clearing of the current output cell\n",
1580
+ "var outputEl = gd.closest('.output');\n",
1581
+ "if (outputEl) {{\n",
1582
+ " x.observe(outputEl, {childList: true});\n",
1583
+ "}}\n",
1584
+ "\n",
1585
+ " }) }; }); </script> </div>"
1586
+ ]
1587
+ },
1588
+ "metadata": {},
1589
+ "output_type": "display_data"
1590
+ }
1591
+ ],
1592
+ "execution_count": 6
1593
+ },
1594
+ {
1595
+ "metadata": {
1596
+ "ExecuteTime": {
1597
+ "end_time": "2024-11-17T01:39:16.853090Z",
1598
+ "start_time": "2024-11-17T01:39:16.170397Z"
1599
+ }
1600
+ },
1601
+ "cell_type": "code",
1602
+ "source": [
1603
+ "from transformer_lens import utils\n",
1604
+ "from functools import partial\n",
1605
+ "\n",
1606
+ "\n",
1607
+ "# next we want to do a reconstruction test.\n",
1608
+ "def reconstr_hook(activation, hook, sae_out):\n",
1609
+ " return sae_out\n",
1610
+ "\n",
1611
+ "\n",
1612
+ "def zero_abl_hook(activation, hook):\n",
1613
+ " return torch.zeros_like(activation)\n",
1614
+ "\n",
1615
+ "\n",
1616
+ "print(\"Orig\", model(batch_tokens, return_type=\"loss\").item())\n",
1617
+ "print(\n",
1618
+ " \"reconstr\",\n",
1619
+ " model.run_with_hooks(\n",
1620
+ " batch_tokens,\n",
1621
+ " fwd_hooks=[\n",
1622
+ " (\n",
1623
+ " sae.cfg.hook_name,\n",
1624
+ " partial(reconstr_hook, sae_out=sae_out),\n",
1625
+ " )\n",
1626
+ " ],\n",
1627
+ " return_type=\"loss\",\n",
1628
+ " ).item(),\n",
1629
+ ")\n",
1630
+ "print(\n",
1631
+ " \"Zero\",\n",
1632
+ " model.run_with_hooks(\n",
1633
+ " batch_tokens,\n",
1634
+ " return_type=\"loss\",\n",
1635
+ " fwd_hooks=[(sae.cfg.hook_name, zero_abl_hook)],\n",
1636
+ " ).item(),\n",
1637
+ ")"
1638
+ ],
1639
+ "id": "ddabe8530685c45",
1640
+ "outputs": [
1641
+ {
1642
+ "name": "stdout",
1643
+ "output_type": "stream",
1644
+ "text": [
1645
+ "Orig 3.5622000694274902\n",
1646
+ "reconstr 3.764155387878418\n",
1647
+ "Zero 11.146590232849121\n"
1648
+ ]
1649
+ }
1650
+ ],
1651
+ "execution_count": 5
1652
+ },
1653
+ {
1654
+ "metadata": {
1655
+ "ExecuteTime": {
1656
+ "end_time": "2024-11-17T01:39:48.048784Z",
1657
+ "start_time": "2024-11-17T01:39:48.033476Z"
1658
+ }
1659
+ },
1660
+ "cell_type": "code",
1661
+ "source": "cfg_dict",
1662
+ "id": "f08540e9e717e9fe",
1663
+ "outputs": [
1664
+ {
1665
+ "data": {
1666
+ "text/plain": [
1667
+ "{'model_name': 'gpt2-small',\n",
1668
+ " 'hook_point': 'blocks.8.hook_resid_pre',\n",
1669
+ " 'hook_point_layer': 8,\n",
1670
+ " 'hook_point_head_index': None,\n",
1671
+ " 'dataset_path': 'Skylion007/openwebtext',\n",
1672
+ " 'is_dataset_tokenized': False,\n",
1673
+ " 'context_size': 128,\n",
1674
+ " 'use_cached_activations': False,\n",
1675
+ " 'cached_activations_path': 'activations/Skylion007_openwebtext/gpt2-small/blocks.8.hook_resid_pre',\n",
1676
+ " 'd_in': 768,\n",
1677
+ " 'n_batches_in_buffer': 128,\n",
1678
+ " 'total_training_tokens': 300000000,\n",
1679
+ " 'store_batch_size': 32,\n",
1680
+ " 'device': 'cuda',\n",
1681
+ " 'seed': 42,\n",
1682
+ " 'dtype': 'torch.float32',\n",
1683
+ " 'b_dec_init_method': 'geometric_median',\n",
1684
+ " 'expansion_factor': 32,\n",
1685
+ " 'from_pretrained_path': None,\n",
1686
+ " 'l1_coefficient': 8e-05,\n",
1687
+ " 'lr': 0.0004,\n",
1688
+ " 'lr_scheduler_name': None,\n",
1689
+ " 'lr_warm_up_steps': 5000,\n",
1690
+ " 'train_batch_size': 4096,\n",
1691
+ " 'use_ghost_grads': False,\n",
1692
+ " 'feature_sampling_window': 1000,\n",
1693
+ " 'feature_sampling_method': None,\n",
1694
+ " 'resample_batches': 1028,\n",
1695
+ " 'feature_reinit_scale': 0.2,\n",
1696
+ " 'dead_feature_window': 5000,\n",
1697
+ " 'dead_feature_estimation_method': 'no_fire',\n",
1698
+ " 'dead_feature_threshold': 1e-08,\n",
1699
+ " 'log_to_wandb': True,\n",
1700
+ " 'wandb_project': 'mats_sae_training_gpt2_small_resid_pre_5',\n",
1701
+ " 'wandb_entity': None,\n",
1702
+ " 'wandb_log_frequency': 100,\n",
1703
+ " 'n_checkpoints': 10,\n",
1704
+ " 'checkpoint_path': 'checkpoints/ut7lhl4q',\n",
1705
+ " 'd_sae': 24576,\n",
1706
+ " 'tokens_per_buffer': 67108864,\n",
1707
+ " 'run_name': '24576-L1-8e-05-LR-0.0004-Tokens-3.000e+08',\n",
1708
+ " 'model_from_pretrained_kwargs': {'center_writing_weights': True},\n",
1709
+ " 'neuronpedia_id': 'gpt2-small/8-res-jb',\n",
1710
+ " 'prepend_bos': True,\n",
1711
+ " 'dataset_trust_remote_code': True,\n",
1712
+ " 'apply_b_dec_to_input': True,\n",
1713
+ " 'finetuning_scaling_factor': False,\n",
1714
+ " 'sae_lens_training_version': None,\n",
1715
+ " 'activation_fn_str': 'relu',\n",
1716
+ " 'architecture': 'standard',\n",
1717
+ " 'normalize_activations': 'none'}"
1718
+ ]
1719
+ },
1720
+ "execution_count": 7,
1721
+ "metadata": {},
1722
+ "output_type": "execute_result"
1723
+ }
1724
+ ],
1725
+ "execution_count": 7
1726
+ },
1727
+ {
1728
+ "metadata": {
1729
+ "ExecuteTime": {
1730
+ "end_time": "2024-11-17T01:43:04.413424Z",
1731
+ "start_time": "2024-11-17T01:43:04.407561Z"
1732
+ }
1733
+ },
1734
+ "cell_type": "code",
1735
+ "source": "sae.W_dec.shape",
1736
+ "id": "5e92bb48ae9ab956",
1737
+ "outputs": [
1738
+ {
1739
+ "data": {
1740
+ "text/plain": [
1741
+ "torch.Size([24576, 768])"
1742
+ ]
1743
+ },
1744
+ "execution_count": 13,
1745
+ "metadata": {},
1746
+ "output_type": "execute_result"
1747
+ }
1748
+ ],
1749
+ "execution_count": 13
1750
+ },
1751
+ {
1752
+ "metadata": {},
1753
+ "cell_type": "code",
1754
+ "outputs": [],
1755
+ "execution_count": null,
1756
+ "source": "",
1757
+ "id": "ab4398bacf9ee3bc"
1758
+ }
1759
+ ],
1760
+ "metadata": {
1761
+ "kernelspec": {
1762
+ "display_name": "Python 3",
1763
+ "language": "python",
1764
+ "name": "python3"
1765
+ },
1766
+ "language_info": {
1767
+ "codemirror_mode": {
1768
+ "name": "ipython",
1769
+ "version": 2
1770
+ },
1771
+ "file_extension": ".py",
1772
+ "mimetype": "text/x-python",
1773
+ "name": "python",
1774
+ "nbconvert_exporter": "python",
1775
+ "pygments_lexer": "ipython2",
1776
+ "version": "2.7.6"
1777
+ }
1778
+ },
1779
+ "nbformat": 4,
1780
+ "nbformat_minor": 5
1781
+ }
nnsight_gemma_steering_file.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import partial
2
+ from contextlib import contextmanager
3
+
4
+ from nnsight import LanguageModel
5
+ import torch
6
+ #from transformer_lens import HookedTransformer, utils
7
+
8
+ from sae_lens import SAE
9
+
10
device = "cuda"

# Every SAE used below comes from the same release; only the layer differs.
_SAE_RELEASE = "gemma-scope-2b-pt-res-canonical"


def _load_canonical_sae(layer_index):
    """Load the ``width_16k/canonical`` SAE for ``layer_index`` onto ``device``.

    ``SAE.from_pretrained`` is unpacked as a 3-tuple; only the first element
    (the SAE itself) is returned.
    """
    loaded_sae, _, _ = SAE.from_pretrained(
        release=_SAE_RELEASE,
        sae_id=f"layer_{layer_index}/width_16k/canonical",
        device=device,
    )
    return loaded_sae


# Load order is unchanged: layers 20, 10, 4, 25.
sae_20 = _load_canonical_sae(20)
sae_10 = _load_canonical_sae(10)
sae_4 = _load_canonical_sae(4)
sae_25 = _load_canonical_sae(25)
33
# Steering features by label. Each entry records which SAE the feature lives
# in ("sae") and which row of that SAE's decoder to use ("index").
feature_dict = {
    label: {"sae": source_sae, "index": feature_index}
    for label, source_sae, feature_index in (
        ("dog", sae_20, 12082),
        ("harry potter4", sae_4, 12445),
        ("harry potter10", sae_10, 6520),
        ("kindness", sae_25, 10092),
        ("yelling", sae_20, 11859),
    )
}
55
# The dtype / default_padding_side overrides are disabled, so the library
# defaults apply. A previously tried alternative checkpoint was
# "meta-llama/Llama-3.2-1B-Instruct".
llm = LanguageModel("google/gemma-2-2b-it", device_map="cuda:0")

# A batch holding a single one-turn conversation.
user_turn = {"role": "user", "content": "What book is Hermione Granger from?"}
batched_chat = [[user_turn]]

# Render the chat template straight to token ids, including the generation
# prompt so the model continues with its own turn.
chat_template_options = {
    "padding": True,
    "tokenize": True,
    "return_tensors": "pt",
    "add_generation_prompt": True,
}
tokens = llm.tokenizer.apply_chat_template(batched_chat, **chat_template_options)
76
+
77
+
78
# Build the steering vector: one SAE decoder row scaled by `strength`.
feature = feature_dict["harry potter4"]
strength = -5
decoder_direction = feature["sae"].W_dec[feature["index"]]
steering_vector = decoder_direction * strength

with llm.generate(tokens, temperature=1, max_new_tokens=128) as tracer:
    # The same vector is added to the output of every decoder layer's
    # post_attention_layernorm module.
    for i in range(len(llm.model.layers)):
        module = llm.model.layers[i].post_attention_layernorm

        # NOTE(review): despite the "resid_pre" names, these are the
        # post_attention_layernorm outputs before and after steering. Both
        # handles are overwritten on every iteration, so only the last
        # layer's values survive the loop.
        resid_pre_before = module.output.clone().save()
        module.output[:] = resid_pre_before + steering_vector

        resid_pre_after = module.output.save()

        # NOTE(review): `module.next()` was left disabled here; presumably the
        # intervention then only covers the first forward pass of generation.
        # Confirm against the nnsight documentation.

    output = llm.generator.output.save()

decoded = llm.tokenizer.batch_decode(output.tolist(), skip_special_tokens=False)
print("output string:", decoded[0])
# Debugging aids: print `output`, `resid_pre_before` or `resid_pre_after`
# here to inspect the raw tensors.
tlens_gemma_steering.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from prometheus_client.decorator import contextmanager
4
+ from tqdm import tqdm
5
+ import plotly.express as px
6
+ from datasets import load_dataset
7
+ from transformer_lens import HookedTransformer, utils
8
+ from functools import partial
9
+ from sae_lens import SAE
10
+ from contextlib import contextmanager
11
+ device = "cuda"
12
+
13
+ from sae_lens import SAE # pip install sae-lens
14
+
15
# All SAEs come from one release and share the width_16k/canonical variant.
_GEMMA_SCOPE_RELEASE = "gemma-scope-2b-pt-res-canonical"


def _load_gemma_scope_sae(layer_index):
    """Return what ``SAE.from_pretrained`` yields for one layer's SAE.

    Callers unpack the result as ``(sae, cfg_dict, sparsity)``.
    """
    return SAE.from_pretrained(
        release=_GEMMA_SCOPE_RELEASE,
        sae_id=f"layer_{layer_index}/width_16k/canonical",
        device=device,
    )


# Load order is unchanged: layers 20, 10, 4.
sae, cfg_dict, sparsity = _load_gemma_scope_sae(20)
sae_10, _, _ = _load_gemma_scope_sae(10)
sae_4, _, _ = _load_gemma_scope_sae(4)

# The instruction-tuned Gemma 2 2B checkpoint, loaded without weight
# processing, in bfloat16 and with left padding.
model = HookedTransformer.from_pretrained_no_processing(
    model_name="google/gemma-2-2b-it",
    device=device,
    dtype=torch.bfloat16,
    default_padding_side="left",
)
layer = 20
sae.eval()
41
+
42
# Steering features by label. Each entry records which SAE the feature lives
# in ("sae") and which row of that SAE's decoder to use ("index").
feature_dict = {
    label: {"sae": source_sae, "index": feature_index}
    for label, source_sae, feature_index in (
        ("dog", sae, 12082),
        ("harry potter4", sae_4, 12445),
        ("harry potter10", sae_10, 6520),
    )
}
56
+
57
def sae_hook(activation, hook, subject, strength):
    """Add a scaled SAE decoder direction to a hooked activation.

    Args:
        activation: Tensor produced at the hook point.
        hook: Hook point object passed in by the hooking framework; unused.
        subject: Key into the module-level ``feature_dict``.
        strength: Scalar multiplier applied to the selected decoder row.

    Returns:
        ``activation`` plus the steering vector, with the same dtype and
        device as ``activation``.

    Raises:
        KeyError: If ``subject`` is not a key of ``feature_dict``.
    """
    feature = feature_dict[subject]
    decoder_row = feature["sae"].W_dec[feature["index"]]
    # Match the activation's dtype and device. Otherwise adding a decoder row
    # of a wider dtype (e.g. float32) to a bfloat16 activation silently
    # promotes the hooked activation, changing what downstream layers receive.
    steering_vector = (decoder_row * strength).to(
        device=activation.device, dtype=activation.dtype
    )
    return activation + steering_vector
61
+
62
+
63
@contextmanager
def steering(subject, strength):
    """Temporarily steer the module-level ``model`` towards a feature.

    Registers ``sae_hook`` on the ``resid_pre`` hook point of every layer for
    the duration of the ``with`` body, then clears the hooks again.

    Args:
        subject: Key into ``feature_dict`` selecting the feature.
        strength: Scalar multiplier for the steering vector.

    Note:
        Cleanup calls ``model.reset_hooks()`` with no arguments, so presumably
        any other hooks present on the model are removed as well.
    """
    hook_fn = partial(sae_hook, subject=subject, strength=strength)
    try:
        for layer_index in range(model.cfg.n_layers):
            model.add_hook(utils.get_act_name('resid_pre', layer_index), hook_fn)
        yield
    finally:
        # Runs even if the body, or hook registration, raises. Without this
        # the steering hooks would stay installed and silently affect every
        # later forward pass.
        model.reset_hooks()
75
+
76
+
77
# A batch holding a single one-turn conversation.
batched_chat = [
    [
        {"role": "user", "content": "What book is Hermione from?"},
    ]
]

# add_generation_prompt=True makes the rendered prompt end with the template's
# generation prompt, so the model continues with its own reply. Without it the
# prompt stops after the user turn.
tokens = model.tokenizer.apply_chat_template(
    batched_chat,
    padding=True,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True,
)
print(tokens)
91
+
92
def _generate_responses(prompt_tokens, max_new_tokens=256):
    """Generate continuations for ``prompt_tokens`` and decode only the new text.

    Args:
        prompt_tokens: Batch of prompt token ids passed to ``model.generate``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        One decoded string per prompt, with special tokens skipped.
    """
    with torch.no_grad():
        batch_output = model.generate(prompt_tokens, max_new_tokens=max_new_tokens)
    # Each output row starts with its prompt; slice that prefix off so only
    # the newly generated tokens are decoded.
    response_tokens = [
        combined[len(prompt):]
        for prompt, combined in zip(prompt_tokens, batch_output)
    ]
    return model.tokenizer.batch_decode(response_tokens, skip_special_tokens=True)


# Run the same prompt twice, first with steering hooks installed and then
# without, printing the first response each time for comparison.
print("steering")
with steering(subject="harry potter10", strength=-5):
    responses = _generate_responses(tokens)
print(responses[0])

print("no steering")
responses = _generate_responses(tokens)
print(responses[0])
+ print(responses[0])