make tools as list in chat_template + update readme
#17
by
Jofthomas
- opened
- README.md +49 -0
- tokenizer_config.json +1 -1
README.md
CHANGED
|
@@ -69,6 +69,55 @@ sp_tokenizer = tokenizer_v3.instruct_tokenizer.tokenizer
|
|
| 69 |
decoded = sp_tokenizer.decode(generated_ids[0])
|
| 70 |
print(decoded)
|
| 71 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
# Instruct tokenizer
|
| 74 |
The HuggingFace tokenizer included in this release should match our own. To compare:
|
|
|
|
| 69 |
decoded = sp_tokenizer.decode(generated_ids[0])
|
| 70 |
print(decoded)
|
| 71 |
```
|
| 72 |
+
Alternatively, you can run this example with the Hugging Face tokenizer.
|
| 73 |
+
To use this example, you'll need transformers version 4.39.0 or higher.
|
| 74 |
+
```console
|
| 75 |
+
pip install transformers==4.39.0
|
| 76 |
+
```
|
| 77 |
+
```python
|
| 78 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 79 |
+
|
| 80 |
+
model_id = "mistralai/Mixtral-8x22B-Instruct-v0.1"
|
| 81 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 82 |
+
conversation=[
|
| 83 |
+
{"role": "user", "content": "What's the weather like in Paris?"},
|
| 84 |
+
{
|
| 85 |
+
"role": "tool_calls",
|
| 86 |
+
"content": [
|
| 87 |
+
{
|
| 88 |
+
"name": "get_current_weather",
|
| 89 |
+
"arguments": {"location": "Paris, France", "format": "celsius"},
|
| 90 |
+
|
| 91 |
+
}
|
| 92 |
+
]
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"role": "tool_results",
|
| 96 |
+
"content": {"content": 22}
|
| 97 |
+
},
|
| 98 |
+
{"role": "assistant", "content": "The current temperature in Paris, France is 22 degrees Celsius."},
|
| 99 |
+
{"role": "user", "content": "What about San Francisco?"}
|
| 100 |
+
]
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
tools = [{"type": "function", "function": {"name":"get_current_weather", "description": "Get the current weather", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}, "format": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "The temperature unit to use. Infer this from the users location."}},"required":["location","format"]}}}]
|
| 104 |
+
|
| 105 |
+
# render the tool use prompt as a string:
|
| 106 |
+
tool_use_prompt = tokenizer.apply_chat_template(
|
| 107 |
+
conversation,
|
| 108 |
+
chat_template="tool_use",
|
| 109 |
+
tools=tools,
|
| 110 |
+
tokenize=False,
|
| 111 |
+
add_generation_prompt=True,
|
| 112 |
+
|
| 113 |
+
)
|
| 114 |
+
model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x22B-Instruct-v0.1")
|
| 115 |
+
|
| 116 |
+
inputs = tokenizer(tool_use_prompt, return_tensors="pt")
|
| 117 |
+
|
| 118 |
+
outputs = model.generate(**inputs, max_new_tokens=20)
|
| 119 |
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
| 120 |
+
```
|
| 121 |
|
| 122 |
# Instruct tokenizer
|
| 123 |
The HuggingFace tokenizer included in this release should match our own. To compare:
|
tokenizer_config.json
CHANGED
|
@@ -36,7 +36,7 @@
|
|
| 36 |
},
|
| 37 |
{
|
| 38 |
"name": "tool_use",
|
| 39 |
-
"template": "{{bos_token}}{% set user_messages = messages | selectattr('role', 'equalto', 'user') | list %}{% for message in messages %}{% if message['role'] == 'user' %}{% if message == user_messages[-1] %}{
|
| 40 |
}
|
| 41 |
],
|
| 42 |
"clean_up_tokenization_spaces": false,
|
|
|
|
| 36 |
},
|
| 37 |
{
|
| 38 |
"name": "tool_use",
|
| 39 |
+
"template": "{{bos_token}}{% set user_messages = messages | selectattr('role', 'equalto', 'user') | list %}{% for message in messages %}{% if message['role'] == 'user' %}{% if message == user_messages[-1] %}{% if tools %}{{'[AVAILABLE_TOOLS]'+ tools|string + '[/AVAILABLE_TOOLS]'}}{% endif %}{{ '[INST]' + message['content'] + '[/INST]' }}{% else %}{{ '[INST]' + message['content'] + '[/INST]' }}{% endif %}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' + eos_token}}{% elif message['role'] == 'tool_results' %}{{'[TOOL_RESULTS]' + message['content']|string + '[/TOOL_RESULTS]'}}{% elif message['role'] == 'tool_calls' %}{{'[TOOL_CALLS]' + message['content']|string + eos_token}}{% endif %}{% endfor %}"
|
| 40 |
}
|
| 41 |
],
|
| 42 |
"clean_up_tokenization_spaces": false,
|