make tools as list in chat_template + update readme
#17
by
Jofthomas
- opened
- README.md +49 -0
- tokenizer_config.json +1 -1
README.md
CHANGED
|
@@ -69,6 +69,55 @@ sp_tokenizer = tokenizer_v3.instruct_tokenizer.tokenizer
|
|
| 69 |
decoded = sp_tokenizer.decode(generated_ids[0])
|
| 70 |
print(decoded)
|
| 71 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
# Instruct tokenizer
|
| 74 |
The HuggingFace tokenizer included in this release should match our own. To compare:
|
|
|
|
| 69 |
decoded = sp_tokenizer.decode(generated_ids[0])
|
| 70 |
print(decoded)
|
| 71 |
```
|
| 72 |
+
Alternatively, you can run this example with the Hugging Face tokenizer.
|
| 73 |
+
To use this example, you'll need transformers version 4.39.0 or higher.
|
| 74 |
+
```console
|
| 75 |
+
pip install transformers==4.39.0
|
| 76 |
+
```
|
| 77 |
+
```python
|
| 78 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 79 |
+
|
| 80 |
+
model_id = "mistralai/Mixtral-8x22B-Instruct-v0.1"
|
| 81 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 82 |
+
conversation=[
|
| 83 |
+
{"role": "user", "content": "What's the weather like in Paris?"},
|
| 84 |
+
{
|
| 85 |
+
"role": "tool_calls",
|
| 86 |
+
"content": [
|
| 87 |
+
{
|
| 88 |
+
"name": "get_current_weather",
|
| 89 |
+
"arguments": {"location": "Paris, France", "format": "celsius"},
|
| 90 |
+
|
| 91 |
+
}
|
| 92 |
+
]
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"role": "tool_results",
|
| 96 |
+
"content": {"content": 22}
|
| 97 |
+
},
|
| 98 |
+
{"role": "assistant", "content": "The current temperature in Paris, France is 22 degrees Celsius."},
|
| 99 |
+
{"role": "user", "content": "What about San Francisco?"}
|
| 100 |
+
]
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
tools = [{"type": "function", "function": {"name":"get_current_weather", "description": "Get the current weather", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}, "format": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "The temperature unit to use. Infer this from the users location."}},"required":["location","format"]}}}]
|
| 104 |
+
|
| 105 |
+
# render the tool use prompt as a string:
|
| 106 |
+
tool_use_prompt = tokenizer.apply_chat_template(
|
| 107 |
+
conversation,
|
| 108 |
+
chat_template="tool_use",
|
| 109 |
+
tools=tools,
|
| 110 |
+
tokenize=False,
|
| 111 |
+
add_generation_prompt=True,
|
| 112 |
+
|
| 113 |
+
)
|
| 114 |
+
model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x22B-Instruct-v0.1")
|
| 115 |
+
|
| 116 |
+
inputs = tokenizer(tool_use_prompt, return_tensors="pt")
|
| 117 |
+
|
| 118 |
+
outputs = model.generate(**inputs, max_new_tokens=20)
|
| 119 |
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
| 120 |
+
```
|
| 121 |
|
| 122 |
# Instruct tokenizer
|
| 123 |
The HuggingFace tokenizer included in this release should match our own. To compare:
|
tokenizer_config.json
CHANGED
|
@@ -36,7 +36,7 @@
|
|
| 36 |
},
|
| 37 |
{
|
| 38 |
"name": "tool_use",
|
| 39 |
-
"template": "{{bos_token}}{% set user_messages = messages | selectattr('role', 'equalto', 'user') | list %}{% for message in messages %}{% if message['role'] == 'user' %}{% if message == user_messages[-1] %}{
|
| 40 |
}
|
| 41 |
],
|
| 42 |
"clean_up_tokenization_spaces": false,
|
|
|
|
| 36 |
},
|
| 37 |
{
|
| 38 |
"name": "tool_use",
|
| 39 |
+
"template": "{{bos_token}}{% set user_messages = messages | selectattr('role', 'equalto', 'user') | list %}{% for message in messages %}{% if message['role'] == 'user' %}{% if message == user_messages[-1] %}{% if tools %}{{'[AVAILABLE_TOOLS]'+ tools|string + '[/AVAILABLE_TOOLS]'}}{% endif %}{{ '[INST]' + message['content'] + '[/INST]' }}{% else %}{{ '[INST]' + message['content'] + '[/INST]' }}{% endif %}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' + eos_token}}{% elif message['role'] == 'tool_results' %}{{'[TOOL_RESULTS]' + message['content']|string + '[/TOOL_RESULTS]'}}{% elif message['role'] == 'tool_calls' %}{{'[TOOL_CALLS]' + message['content']|string + eos_token}}{% endif %}{% endfor %}"
|
| 40 |
}
|
| 41 |
],
|
| 42 |
"clean_up_tokenization_spaces": false,
|