add chat_template (#4)
Browse files- add chat_template (373b65c403f3cfc50e2d8f0a3b0b0353078cd005)
Co-authored-by: one dozon <[email protected]>
- README.md +1 -1
- tokenization_chatglm.py +32 -7
- tokenizer_config.json +19 -0
README.md
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
```python
|
| 5 |
>>> from transformers import AutoTokenizer, AutoModel
|
| 6 |
>>> tokenizer = AutoTokenizer.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True)
|
| 7 |
-
>>> model = AutoModel.from_pretrained("
|
| 8 |
>>> model = model.eval()
|
| 9 |
>>> session_meta = {'user_info': '我是陆星辰,是一个男性,是一位知名导演,也是苏梦远的合作导演。我擅长拍摄音乐题材的电影。苏梦远对我的态度是尊敬的,并视我为良师益友。', 'bot_info': '苏梦远,本名苏远心,是一位当红的国内女歌手及演员。在参加选秀节目后,凭借独特的嗓音及出众的舞台魅力迅速成名,进入娱乐圈。她外表美丽动人,但真正的魅力在于她的才华和勤奋。苏梦远是音乐学院毕业的优秀生,善于创作,拥有多首热门原创歌曲。除了音乐方面的成就,她还热衷于慈善事业,积极参加公益活动,用实际行动传递正能量。在工作中,她对待工作非常敬业,拍戏时总是全身心投入角色,赢得了业内人士的赞誉和粉丝的喜爱。虽然在娱乐圈,但她始终保持低调、谦逊的态度,深得同行尊重。在表达时,苏梦远喜欢使用“我们”和“一起”,强调团队精神。', 'bot_name': '苏梦远', 'user_name': '陆星辰'}
|
| 10 |
>>> response, history = model.chat(tokenizer, session_meta, "你好", history=[])
|
|
|
|
| 4 |
```python
|
| 5 |
>>> from transformers import AutoTokenizer, AutoModel
|
| 6 |
>>> tokenizer = AutoTokenizer.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True)
|
| 7 |
+
>>> model = AutoModel.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True, device='cuda')
|
| 8 |
>>> model = model.eval()
|
| 9 |
>>> session_meta = {'user_info': '我是陆星辰,是一个男性,是一位知名导演,也是苏梦远的合作导演。我擅长拍摄音乐题材的电影。苏梦远对我的态度是尊敬的,并视我为良师益友。', 'bot_info': '苏梦远,本名苏远心,是一位当红的国内女歌手及演员。在参加选秀节目后,凭借独特的嗓音及出众的舞台魅力迅速成名,进入娱乐圈。她外表美丽动人,但真正的魅力在于她的才华和勤奋。苏梦远是音乐学院毕业的优秀生,善于创作,拥有多首热门原创歌曲。除了音乐方面的成就,她还热衷于慈善事业,积极参加公益活动,用实际行动传递正能量。在工作中,她对待工作非常敬业,拍戏时总是全身心投入角色,赢得了业内人士的赞誉和粉丝的喜爱。虽然在娱乐圈,但她始终保持低调、谦逊的态度,深得同行尊重。在表达时,苏梦远喜欢使用“我们”和“一起”,强调团队精神。', 'bot_name': '苏梦远', 'user_name': '陆星辰'}
|
| 10 |
>>> response, history = model.chat(tokenizer, session_meta, "你好", history=[])
|
tokenization_chatglm.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import os
|
| 2 |
-
import
|
| 3 |
from typing import List, Optional, Union, Dict
|
| 4 |
from sentencepiece import SentencePieceProcessor
|
| 5 |
from transformers import PreTrainedTokenizer
|
|
@@ -27,9 +27,22 @@ class SPTokenizer:
|
|
| 27 |
self.special_tokens[token] = self.n_words
|
| 28 |
self.index_special_tokens[self.n_words] = token
|
| 29 |
self.n_words += 1
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
|
| 35 |
assert type(s) is str
|
|
@@ -41,7 +54,18 @@ class SPTokenizer:
|
|
| 41 |
return t
|
| 42 |
|
| 43 |
def decode(self, t: List[int]) -> str:
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
def decode_tokens(self, tokens: List[str]) -> str:
|
| 47 |
text = self.sp_model.DecodePieces(tokens)
|
|
@@ -65,7 +89,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
| 65 |
|
| 66 |
model_input_names = ["input_ids", "attention_mask", "position_ids"]
|
| 67 |
|
| 68 |
-
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
|
| 69 |
self.name = "GLMTokenizer"
|
| 70 |
|
| 71 |
self.vocab_file = vocab_file
|
|
@@ -75,6 +99,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
| 75 |
"<eos>": self.tokenizer.eos_id,
|
| 76 |
"<pad>": self.tokenizer.pad_id
|
| 77 |
}
|
|
|
|
| 78 |
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
|
| 79 |
|
| 80 |
def get_command(self, token):
|
|
@@ -110,7 +135,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
| 110 |
return vocab
|
| 111 |
|
| 112 |
def _tokenize(self, text, **kwargs):
|
| 113 |
-
return self.tokenizer.tokenize(text)
|
| 114 |
|
| 115 |
def _convert_token_to_id(self, token):
|
| 116 |
""" Converts a token (str) in an id using the vocab. """
|
|
|
|
| 1 |
import os
|
| 2 |
+
import re
|
| 3 |
from typing import List, Optional, Union, Dict
|
| 4 |
from sentencepiece import SentencePieceProcessor
|
| 5 |
from transformers import PreTrainedTokenizer
|
|
|
|
| 27 |
self.special_tokens[token] = self.n_words
|
| 28 |
self.index_special_tokens[self.n_words] = token
|
| 29 |
self.n_words += 1
|
| 30 |
+
self.role_special_token_expression = "|".join([re.escape(token) for token in special_tokens]) # for apply_chat_template
|
| 31 |
+
|
| 32 |
+
def tokenize(self, s: str, encode_special_tokens=False):
    """Split *s* into sentencepiece tokens.

    When ``encode_special_tokens`` is True, substrings matching
    ``self.role_special_token_expression`` are emitted verbatim as single
    tokens and only the text between them is run through sentencepiece;
    otherwise the whole string is handed to sentencepiece directly.
    """
    if not encode_special_tokens:
        return self.sp_model.EncodeAsPieces(s)
    pieces = []
    cursor = 0
    for hit in re.finditer(self.role_special_token_expression, s):
        start, end = hit.span()
        if cursor < start:
            # Ordinary text before this special token goes through sentencepiece.
            pieces.extend(self.sp_model.EncodeAsPieces(s[cursor:start]))
        # The special token itself is kept as one verbatim piece.
        pieces.append(s[start:end])
        cursor = end
    if cursor < len(s):
        # Trailing ordinary text after the last special token.
        pieces.extend(self.sp_model.EncodeAsPieces(s[cursor:]))
    return pieces
|
| 46 |
|
| 47 |
def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
|
| 48 |
assert type(s) is str
|
|
|
|
| 54 |
return t
|
| 55 |
|
| 56 |
def decode(self, t: List[int]) -> str:
    """Decode token ids to text.

    Ids registered in ``self.index_special_tokens`` are rendered as their
    literal surface string; runs of ordinary ids between them are decoded
    in one batch by sentencepiece.
    """
    parts = []
    pending = []  # run of ordinary ids awaiting sentencepiece decoding
    for tok_id in t:
        special = self.index_special_tokens.get(tok_id)
        if special is None:
            pending.append(tok_id)
            continue
        if pending:
            parts.append(self.sp_model.decode(pending))
            pending = []
        parts.append(special)
    if pending:
        parts.append(self.sp_model.decode(pending))
    return "".join(parts)
|
| 69 |
|
| 70 |
def decode_tokens(self, tokens: List[str]) -> str:
|
| 71 |
text = self.sp_model.DecodePieces(tokens)
|
|
|
|
| 89 |
|
| 90 |
model_input_names = ["input_ids", "attention_mask", "position_ids"]
|
| 91 |
|
| 92 |
+
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, **kwargs):
|
| 93 |
self.name = "GLMTokenizer"
|
| 94 |
|
| 95 |
self.vocab_file = vocab_file
|
|
|
|
| 99 |
"<eos>": self.tokenizer.eos_id,
|
| 100 |
"<pad>": self.tokenizer.pad_id
|
| 101 |
}
|
| 102 |
+
self.encode_special_tokens = encode_special_tokens
|
| 103 |
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
|
| 104 |
|
| 105 |
def get_command(self, token):
|
|
|
|
| 135 |
return vocab
|
| 136 |
|
| 137 |
def _tokenize(self, text, **kwargs):
    """Delegate tokenization to the underlying SPTokenizer, honouring the
    ``encode_special_tokens`` flag chosen at construction time."""
    return self.tokenizer.tokenize(
        text, encode_special_tokens=self.encode_special_tokens
    )
|
| 139 |
|
| 140 |
def _convert_token_to_id(self, token):
|
| 141 |
""" Converts a token (str) in an id using the vocab. """
|
tokenizer_config.json
CHANGED
|
@@ -1,10 +1,29 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"auto_map": {
|
| 3 |
"AutoTokenizer": [
|
| 4 |
"tokenization_chatglm.ChatGLMTokenizer",
|
| 5 |
null
|
| 6 |
]
|
| 7 |
},
|
|
|
|
| 8 |
"clean_up_tokenization_spaces": true,
|
| 9 |
"do_lower_case": false,
|
| 10 |
"model_max_length": 1000000000000000019884624838656,
|
|
|
|
| 1 |
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"64790": {
|
| 4 |
+
"content": "[gMASK]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": true,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": false
|
| 10 |
+
},
|
| 11 |
+
"64792": {
|
| 12 |
+
"content": "sop",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": true,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": false
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
"auto_map": {
|
| 21 |
"AutoTokenizer": [
|
| 22 |
"tokenization_chatglm.ChatGLMTokenizer",
|
| 23 |
null
|
| 24 |
]
|
| 25 |
},
|
| 26 |
+
"chat_template": "{% set ns = namespace() %}[gMASK]sop{% for message in messages %}{% if loop.first %}{% set ns.bot_name = message['bot_name'] %}{% set ns.user_name = message['user_name'] %}以下是一段{{ message['bot_name'] }}和{{ message['user_name'] }}之间的对话。{%+ if message['bot_profile'] is defined and message['bot_profile']|length +%}\n关于{{ message['bot_name'] }}的信息:{{ message['bot_profile']|replace('\n', ' ') }}{% endif %}{%+ if message['user_profile'] is defined and message['user_profile']|length +%}\n关于{{ message['user_name'] }}的信息:{{ message['user_profile']|replace('\n', ' ') }}{% endif %}{%+ else +%}\n[{% if message['role'] == 'user' %}{{ ns.user_name }}{% else %}{{ ns.bot_name }}{% endif %}]{{ message['content']|replace('\n', ' ') }}{% endif %}{% endfor %}{%+ if add_generation_prompt +%}\n[{{ ns.bot_name }}]{% endif %}",
|
| 27 |
"clean_up_tokenization_spaces": true,
|
| 28 |
"do_lower_case": false,
|
| 29 |
"model_max_length": 1000000000000000019884624838656,
|