mikasenghaas committed on
Commit
b02b805
·
unverified ·
1 Parent(s): e433136

Show message dict and og tokenizer

Browse files
Files changed (1) hide show
  1. test_template.py +118 -132
test_template.py CHANGED
@@ -7,201 +7,183 @@
7
  from transformers import AutoTokenizer
8
 
9
 
10
- def print_section(title, content):
11
  """Helper function to print formatted sections"""
12
  print(f"\n{'=' * 60}")
13
  print(f"{title}")
14
  print(f"{'=' * 60}")
15
- print(content)
16
- print()
 
 
 
 
 
17
 
18
 
19
  # Initialize tokenizer
20
- tok = AutoTokenizer.from_pretrained(".")
 
 
21
 
22
  # Only user message
23
  print_section(
24
  "User message only",
25
- tok.apply_chat_template(
26
- [{"role": "user", "content": "What is the capital of France?"}], tokenize=False
27
- ),
28
  )
29
 
30
  # User message with generation prompt
31
  print_section(
32
  "User message with generation prompt",
33
- tok.apply_chat_template(
34
- [
35
- {"role": "user", "content": "What is the capital of France?"},
36
- ],
37
- tokenize=False,
38
- add_generation_prompt=True,
39
- ),
40
  )
41
 
42
  # User message with custom system message
43
  print_section(
44
  "Custom system message",
45
- tok.apply_chat_template(
46
- [
47
- {"role": "system", "content": "You are a helpful assistant."},
48
- {"role": "user", "content": "What is the capital of France?"},
49
- ],
50
- tokenize=False,
51
- ),
52
  )
53
 
54
  # Single-turn with assistant response (no think)
55
  print_section(
56
  "Single-turn with assistant response (no think)",
57
- tok.apply_chat_template(
58
- [
59
- {"role": "user", "content": "What is the capital of France?"},
60
- {"role": "assistant", "content": "The capital of France is Paris."},
61
- ],
62
- tokenize=False,
63
- ),
64
  )
65
 
66
  # Single-turn with think embedded in content
67
  print_section(
68
  "Single-turn with think embedded in content",
69
- tok.apply_chat_template(
70
- [
71
- {"role": "user", "content": "What is the capital of France?"},
72
- {
73
- "role": "assistant",
74
- "content": "<think>The user is asking about geography. France is a country in Europe, and its capital city is Paris. This is a straightforward factual question.</think>\nThe capital of France is Paris.",
75
- },
76
- ],
77
- tokenize=False,
78
- ),
79
  )
80
 
81
  # Single-turn with reasoning_content field
82
  print_section(
83
  "Single-turn with reasoning_content field",
84
- tok.apply_chat_template(
85
- [
86
- {"role": "user", "content": "What is the capital of France?"},
87
- {
88
- "role": "assistant",
89
- "content": "The capital of France is Paris.",
90
- "reasoning_content": "The user is asking about geography. France is a country in Europe, and its capital city is Paris.",
91
- },
92
- ],
93
- tokenize=False,
94
- ),
95
  )
96
 
97
  print_section(
98
  "Single-turn with think section and reasoning_content field",
99
- tok.apply_chat_template(
100
- [
101
- {"role": "user", "content": "What is the capital of France?"},
102
- {
103
- "role": "assistant",
104
- "content": "<think>The user is asking about geography. France is a country in Europe, and its capital city is Paris. This is a straightforward factual question.</think>\nThe capital of France is Paris.",
105
- "reasoning_content": "This should not be visible.",
106
- },
107
- ],
108
- tokenize=False,
109
- ),
110
  )
111
 
112
 
113
  # Multi-turn and assistant response with think sections (embedded in content)
114
  print_section(
115
  "Multi-turn with think embedded in content",
116
- tok.apply_chat_template(
117
- [
118
- {"role": "user", "content": "What is the capital of France?"},
119
- {
120
- "role": "assistant",
121
- "content": "<think>This is a basic geography question.</think>\nThe capital of France is Paris.",
122
- },
123
- {"role": "user", "content": "What about Germany?"},
124
- {
125
- "role": "assistant",
126
- "content": "<think>Another geography question. Germany's capital is Berlin.</think>\nThe capital of Germany is Berlin.",
127
- },
128
- ],
129
- tokenize=False,
130
- ),
131
  )
132
 
133
  # Multi-turn and assistant response with reasoning passed via the reasoning_content field
134
  print_section(
135
  "Multi-turn with reasoning_content field",
136
- tok.apply_chat_template(
137
- [
138
- {"role": "user", "content": "What is the capital of France?"},
139
- {
140
- "role": "assistant",
141
- "reasoning_content": "The user is asking about geography. France is a country in Europe, and its capital city is Paris.",
142
- "content": "The capital of France is Paris.",
143
- },
144
- {"role": "user", "content": "What about Germany?"},
145
- {
146
- "role": "assistant",
147
- "reasoning_content": "Another geography question. Germany's capital is Berlin.",
148
- "content": "The capital of Germany is Berlin.",
149
- },
150
- ],
151
- tokenize=False,
152
- ),
153
  )
154
 
155
  # Assistant with only think section, no visible content
156
  print_section(
157
  "Assistant with only think section",
158
- tok.apply_chat_template(
159
- [
160
- {
161
- "role": "user",
162
- "content": "Think about this problem but don't respond yet.",
163
- },
164
- {
165
- "role": "assistant",
166
- "content": "<think>The user wants me to think about something but not provide a response yet. I should just show my thinking process without any visible output.</think>",
167
- },
168
- ],
169
- tokenize=False,
170
- ),
171
  )
172
 
173
  # Assistant with unfinished think section
174
  print_section(
175
  "Assistant with unfinished think section",
176
- tok.apply_chat_template(
177
- [
178
- {
179
- "role": "user",
180
- "content": "Think about this problem but don't respond yet.",
181
- },
182
- {
183
- "role": "assistant",
184
- "content": "<think>The user wants me to think about something but not provide a response yet. I should just",
185
- },
186
- ],
187
- tokenize=False,
188
- ),
189
  )
190
 
191
  print_section(
192
  "Empty reasoning content",
193
- tok.apply_chat_template(
194
- [
195
- {"role": "system", "content": "You are a helpful assistant."},
196
- {"role": "user", "content": "Say hello"},
197
- {
198
- "role": "assistant",
199
- "content": "Hello! How can I help you today?",
200
- "reasoning_content": "",
201
- },
202
- ],
203
- tokenize=False,
204
- ),
205
  )
206
 
207
 
@@ -255,7 +237,9 @@ tools = [
255
 
256
  print_section(
257
  "Single-turn tool use with weather",
258
- tok.apply_chat_template(tool_example, tokenize=False, tools=tools),
 
 
259
  )
260
 
261
  # ============================================================================
@@ -312,5 +296,7 @@ multi_tools = [
312
 
313
  print_section(
314
  "Single-turn with multiple tool calls",
315
- tok.apply_chat_template(multi_tool_example, tokenize=False, tools=multi_tools),
 
 
316
  )
 
7
  from transformers import AutoTokenizer
8
 
9
 
10
+ def print_section(title, messages, tokenizers, **tokenizer_kwargs):
11
  """Print a banner for `title`, the raw `messages`, and each tokenizer's untokenized chat-template rendering of them (extra kwargs are forwarded to `apply_chat_template`)."""
12
  print(f"\n{'=' * 60}")
13
  print(f"{title}")
14
  print(f"{'=' * 60}")
15
+ print(f"\n{messages=}\n")
16
+ for tokenizer_name, tokenizer in tokenizers.items():
17
+ print(f"\n{tokenizer_name=}\n")
18
+ content = tokenizer.apply_chat_template(
19
+ messages, tokenize=False, **tokenizer_kwargs
20
+ )
21
+ print(content)
22
 
23
 
24
  # Initialize tokenizers: the local one and the original GLM-4.5-Air one for comparison
25
+ local_tokenizer = AutoTokenizer.from_pretrained(".")
26
+ glm_tokenizer = AutoTokenizer.from_pretrained("zai-org/GLM-4.5-Air")
27
+ tokenizers = {"Local": local_tokenizer, "GLM-4.5-Air": glm_tokenizer}
28
 
29
  # Only user message
30
  print_section(
31
  "User message only",
32
+ [{"role": "user", "content": "What is the capital of France?"}],
33
+ tokenizers,
 
34
  )
35
 
36
  # User message with generation prompt
37
  print_section(
38
  "User message with generation prompt",
39
+ [{"role": "user", "content": "What is the capital of France?"}],
40
+ tokenizers,
41
+ add_generation_prompt=True,
 
 
 
 
42
  )
43
 
44
  # User message with custom system message
45
  print_section(
46
  "Custom system message",
47
+ [
48
+ {"role": "system", "content": "You are a helpful assistant."},
49
+ {"role": "user", "content": "What is the capital of France?"},
50
+ ],
51
+ tokenizers,
 
 
52
  )
53
 
54
  # Single-turn with assistant response (no think)
55
  print_section(
56
  "Single-turn with assistant response (no think)",
57
+ [
58
+ {"role": "user", "content": "What is the capital of France?"},
59
+ {"role": "assistant", "content": "The capital of France is Paris."},
60
+ ],
61
+ tokenizers,
 
 
62
  )
63
 
64
  # Single-turn with think embedded in content
65
  print_section(
66
  "Single-turn with think embedded in content",
67
+ [
68
+ {"role": "user", "content": "What is the capital of France?"},
69
+ {
70
+ "role": "assistant",
71
+ "content": "<think>The user is asking about geography. France is a country in Europe, and its capital city is Paris. This is a straightforward factual question.</think>\nThe capital of France is Paris.",
72
+ },
73
+ ],
74
+ tokenizers,
 
 
75
  )
76
 
77
  # Single-turn with reasoning_content field
78
  print_section(
79
  "Single-turn with reasoning_content field",
80
+ [
81
+ {"role": "user", "content": "What is the capital of France?"},
82
+ {
83
+ "role": "assistant",
84
+ "content": "The capital of France is Paris.",
85
+ "reasoning_content": "The user is asking about geography. France is a country in Europe, and its capital city is Paris.",
86
+ },
87
+ ],
88
+ tokenizers,
 
 
89
  )
90
 
91
  print_section(
92
  "Single-turn with think section and reasoning_content field",
93
+ [
94
+ {"role": "user", "content": "What is the capital of France?"},
95
+ {
96
+ "role": "assistant",
97
+ "content": "<think>The user is asking about geography. France is a country in Europe, and its capital city is Paris. This is a straightforward factual question.</think>\nThe capital of France is Paris.",
98
+ "reasoning_content": "This should not be visible.",
99
+ },
100
+ ],
101
+ tokenizers,
 
 
102
  )
103
 
104
 
105
  # Multi-turn and assistant response with think sections (embedded in content)
106
  print_section(
107
  "Multi-turn with think embedded in content",
108
+ [
109
+ {"role": "user", "content": "What is the capital of France?"},
110
+ {
111
+ "role": "assistant",
112
+ "content": "<think>This is a basic geography question.</think>\nThe capital of France is Paris.",
113
+ },
114
+ {"role": "user", "content": "What about Germany?"},
115
+ {
116
+ "role": "assistant",
117
+ "content": "<think>Another geography question. Germany's capital is Berlin.</think>\nThe capital of Germany is Berlin.",
118
+ },
119
+ ],
120
+ tokenizers,
 
 
121
  )
122
 
123
  # Multi-turn and assistant response with reasoning passed via the reasoning_content field
124
  print_section(
125
  "Multi-turn with reasoning_content field",
126
+ [
127
+ {"role": "user", "content": "What is the capital of France?"},
128
+ {
129
+ "role": "assistant",
130
+ "reasoning_content": "The user is asking about geography. France is a country in Europe, and its capital city is Paris.",
131
+ "content": "The capital of France is Paris.",
132
+ },
133
+ {"role": "user", "content": "What about Germany?"},
134
+ {
135
+ "role": "assistant",
136
+ "reasoning_content": "Another geography question. Germany's capital is Berlin.",
137
+ "content": "The capital of Germany is Berlin.",
138
+ },
139
+ ],
140
+ tokenizers,
 
 
141
  )
142
 
143
  # Assistant with only think section, no visible content
144
  print_section(
145
  "Assistant with only think section",
146
+ [
147
+ {
148
+ "role": "user",
149
+ "content": "Think about this problem but don't respond yet.",
150
+ },
151
+ {
152
+ "role": "assistant",
153
+ "content": "<think>The user wants me to think about something but not provide a response yet. I should just show my thinking process without any visible output.</think>",
154
+ },
155
+ ],
156
+ tokenizers,
 
 
157
  )
158
 
159
  # Assistant with unfinished think section
160
  print_section(
161
  "Assistant with unfinished think section",
162
+ [
163
+ {
164
+ "role": "user",
165
+ "content": "Think about this problem but don't respond yet.",
166
+ },
167
+ {
168
+ "role": "assistant",
169
+ "content": "<think>The user wants me to think about something but not provide a response yet. I should just",
170
+ },
171
+ ],
172
+ tokenizers,
 
 
173
  )
174
 
175
  print_section(
176
  "Empty reasoning content",
177
+ [
178
+ {"role": "system", "content": "You are a helpful assistant."},
179
+ {"role": "user", "content": "Say hello"},
180
+ {
181
+ "role": "assistant",
182
+ "content": "Hello! How can I help you today?",
183
+ "reasoning_content": "",
184
+ },
185
+ ],
186
+ tokenizers,
 
 
187
  )
188
 
189
 
 
237
 
238
  print_section(
239
  "Single-turn tool use with weather",
240
+ tool_example,
241
+ tokenizers,
242
+ tools=tools,
243
  )
244
 
245
  # ============================================================================
 
296
 
297
  print_section(
298
  "Single-turn with multiple tool calls",
299
+ multi_tool_example,
300
+ tokenizers,
301
+ tools=multi_tools,
302
  )