add chat_template (#4)
Browse files- add chat_template (373b65c403f3cfc50e2d8f0a3b0b0353078cd005)
Co-authored-by: one dozon <[email protected]>
- README.md +1 -1
- tokenization_chatglm.py +32 -7
- tokenizer_config.json +19 -0
README.md
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
```python
|
| 5 |
>>> from transformers import AutoTokenizer, AutoModel
|
| 6 |
>>> tokenizer = AutoTokenizer.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True)
|
| 7 |
-
>>> model = AutoModel.from_pretrained("
|
| 8 |
>>> model = model.eval()
|
| 9 |
>>> session_meta = {'user_info': '我是陆星辰,是一个男性,是一位知名导演,也是苏梦远的合作导演。我擅长拍摄音乐题材的电影。苏梦远对我的态度是尊敬的,并视我为良师益友。', 'bot_info': '苏梦远,本名苏远心,是一位当红的国内女歌手及演员。在参加选秀节目后,凭借独特的嗓音及出众的舞台魅力迅速成名,进入娱乐圈。她外表美丽动人,但真正的魅力在于她的才华和勤奋。苏梦远是音乐学院毕业的优秀生,善于创作,拥有多首热门原创歌曲。除了音乐方面的成就,她还热衷于慈善事业,积极参加公益活动,用实际行动传递正能量。在工作中,她对待工作非常敬业,拍戏时总是全身心投入角色,赢得了业内人士的赞誉和粉丝的喜爱。虽然在娱乐圈,但她始终保持低调、谦逊的态度,深得同行尊重。在表达时,苏梦远喜欢使用“我们”和“一起”,强调团队精神。', 'bot_name': '苏梦远', 'user_name': '陆星辰'}
|
| 10 |
>>> response, history = model.chat(tokenizer, session_meta, "你好", history=[])
|
|
|
|
| 4 |
```python
|
| 5 |
>>> from transformers import AutoTokenizer, AutoModel
|
| 6 |
>>> tokenizer = AutoTokenizer.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True)
|
| 7 |
+
>>> model = AutoModel.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True, device='cuda')
|
| 8 |
>>> model = model.eval()
|
| 9 |
>>> session_meta = {'user_info': '我是陆星辰,是一个男性,是一位知名导演,也是苏梦远的合作导演。我擅长拍摄音乐题材的电影。苏梦远对我的态度是尊敬的,并视我为良师益友。', 'bot_info': '苏梦远,本名苏远心,是一位当红的国内女歌手及演员。在参加选秀节目后,凭借独特的嗓音及出众的舞台魅力迅速成名,进入娱乐圈。她外表美丽动人,但真正的魅力在于她的才华和勤奋。苏梦远是音乐学院毕业的优秀生,善于创作,拥有多首热门原创歌曲。除了音乐方面的成就,她还热衷于慈善事业,积极参加公益活动,用实际行动传递正能量。在工作中,她对待工作非常敬业,拍戏时总是全身心投入角色,赢得了业内人士的赞誉和粉丝的喜爱。虽然在娱乐圈,但她始终保持低调、谦逊的态度,深得同行尊重。在表达时,苏梦远喜欢使用“我们”和“一起”,强调团队精神。', 'bot_name': '苏梦远', 'user_name': '陆星辰'}
|
| 10 |
>>> response, history = model.chat(tokenizer, session_meta, "你好", history=[])
|
tokenization_chatglm.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import os
|
| 2 |
-
import
|
| 3 |
from typing import List, Optional, Union, Dict
|
| 4 |
from sentencepiece import SentencePieceProcessor
|
| 5 |
from transformers import PreTrainedTokenizer
|
|
@@ -27,9 +27,22 @@ class SPTokenizer:
|
|
| 27 |
self.special_tokens[token] = self.n_words
|
| 28 |
self.index_special_tokens[self.n_words] = token
|
| 29 |
self.n_words += 1
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
|
| 35 |
assert type(s) is str
|
|
@@ -41,7 +54,18 @@ class SPTokenizer:
|
|
| 41 |
return t
|
| 42 |
|
| 43 |
def decode(self, t: List[int]) -> str:
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
def decode_tokens(self, tokens: List[str]) -> str:
|
| 47 |
text = self.sp_model.DecodePieces(tokens)
|
|
@@ -65,7 +89,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
| 65 |
|
| 66 |
model_input_names = ["input_ids", "attention_mask", "position_ids"]
|
| 67 |
|
| 68 |
-
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
|
| 69 |
self.name = "GLMTokenizer"
|
| 70 |
|
| 71 |
self.vocab_file = vocab_file
|
|
@@ -75,6 +99,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
| 75 |
"<eos>": self.tokenizer.eos_id,
|
| 76 |
"<pad>": self.tokenizer.pad_id
|
| 77 |
}
|
|
|
|
| 78 |
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
|
| 79 |
|
| 80 |
def get_command(self, token):
|
|
@@ -110,7 +135,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
| 110 |
return vocab
|
| 111 |
|
| 112 |
def _tokenize(self, text, **kwargs):
|
| 113 |
-
return self.tokenizer.tokenize(text)
|
| 114 |
|
| 115 |
def _convert_token_to_id(self, token):
|
| 116 |
""" Converts a token (str) in an id using the vocab. """
|
|
|
|
| 1 |
import os
|
| 2 |
+
import re
|
| 3 |
from typing import List, Optional, Union, Dict
|
| 4 |
from sentencepiece import SentencePieceProcessor
|
| 5 |
from transformers import PreTrainedTokenizer
|
|
|
|
| 27 |
self.special_tokens[token] = self.n_words
|
| 28 |
self.index_special_tokens[self.n_words] = token
|
| 29 |
self.n_words += 1
|
| 30 |
+
self.role_special_token_expression = "|".join([re.escape(token) for token in special_tokens]) # for apply_chat_template
|
| 31 |
+
|
| 32 |
+
def tokenize(self, s: str, encode_special_tokens=False):
    """Split *s* into sentencepiece tokens.

    When ``encode_special_tokens`` is True, substrings matching
    ``self.role_special_token_expression`` are emitted verbatim as single
    tokens and only the text between them is run through sentencepiece;
    otherwise the whole string is handed to sentencepiece directly.
    """
    if not encode_special_tokens:
        return self.sp_model.EncodeAsPieces(s)
    pieces = []
    cursor = 0
    for hit in re.finditer(self.role_special_token_expression, s):
        start, end = hit.span()
        if cursor < start:
            # Ordinary text before this special token goes through sentencepiece.
            pieces.extend(self.sp_model.EncodeAsPieces(s[cursor:start]))
        # The special token itself is kept as one verbatim piece.
        pieces.append(s[start:end])
        cursor = end
    if cursor < len(s):
        # Trailing ordinary text after the last special token.
        pieces.extend(self.sp_model.EncodeAsPieces(s[cursor:]))
    return pieces
|
| 46 |
|
| 47 |
def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
|
| 48 |
assert type(s) is str
|
|
|
|
| 54 |
return t
|
| 55 |
|
| 56 |
def decode(self, t: List[int]) -> str:
    """Decode token ids to text.

    Ids registered in ``self.index_special_tokens`` are rendered as their
    literal surface string; runs of ordinary ids between them are decoded
    in one batch by sentencepiece.
    """
    parts = []
    pending = []  # run of ordinary ids awaiting sentencepiece decoding
    for tok_id in t:
        special = self.index_special_tokens.get(tok_id)
        if special is None:
            pending.append(tok_id)
            continue
        if pending:
            parts.append(self.sp_model.decode(pending))
            pending = []
        parts.append(special)
    if pending:
        parts.append(self.sp_model.decode(pending))
    return "".join(parts)
|
| 69 |
|
| 70 |
def decode_tokens(self, tokens: List[str]) -> str:
|
| 71 |
text = self.sp_model.DecodePieces(tokens)
|
|
|
|
| 89 |
|
| 90 |
model_input_names = ["input_ids", "attention_mask", "position_ids"]
|
| 91 |
|
| 92 |
+
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, **kwargs):
|
| 93 |
self.name = "GLMTokenizer"
|
| 94 |
|
| 95 |
self.vocab_file = vocab_file
|
|
|
|
| 99 |
"<eos>": self.tokenizer.eos_id,
|
| 100 |
"<pad>": self.tokenizer.pad_id
|
| 101 |
}
|
| 102 |
+
self.encode_special_tokens = encode_special_tokens
|
| 103 |
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
|
| 104 |
|
| 105 |
def get_command(self, token):
|
|
|
|
| 135 |
return vocab
|
| 136 |
|
| 137 |
def _tokenize(self, text, **kwargs):
    """Delegate tokenization to the underlying SPTokenizer, honouring the
    ``encode_special_tokens`` flag chosen at construction time."""
    return self.tokenizer.tokenize(
        text, encode_special_tokens=self.encode_special_tokens
    )
|
| 139 |
|
| 140 |
def _convert_token_to_id(self, token):
|
| 141 |
""" Converts a token (str) in an id using the vocab. """
|
tokenizer_config.json
CHANGED
|
@@ -1,10 +1,29 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"auto_map": {
|
| 3 |
"AutoTokenizer": [
|
| 4 |
"tokenization_chatglm.ChatGLMTokenizer",
|
| 5 |
null
|
| 6 |
]
|
| 7 |
},
|
|
|
|
| 8 |
"clean_up_tokenization_spaces": true,
|
| 9 |
"do_lower_case": false,
|
| 10 |
"model_max_length": 1000000000000000019884624838656,
|
|
|
|
| 1 |
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"64790": {
|
| 4 |
+
"content": "[gMASK]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": true,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": false
|
| 10 |
+
},
|
| 11 |
+
"64792": {
|
| 12 |
+
"content": "sop",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": true,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": false
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
"auto_map": {
|
| 21 |
"AutoTokenizer": [
|
| 22 |
"tokenization_chatglm.ChatGLMTokenizer",
|
| 23 |
null
|
| 24 |
]
|
| 25 |
},
|
| 26 |
+
"chat_template": "{% set ns = namespace() %}[gMASK]sop{% for message in messages %}{% if loop.first %}{% set ns.bot_name = message['bot_name'] %}{% set ns.user_name = message['user_name'] %}以下是一段{{ message['bot_name'] }}和{{ message['user_name'] }}之间的对话。{%+ if message['bot_profile'] is defined and message['bot_profile']|length +%}\n关于{{ message['bot_name'] }}的信息:{{ message['bot_profile']|replace('\n', ' ') }}{% endif %}{%+ if message['user_profile'] is defined and message['user_profile']|length +%}\n关于{{ message['user_name'] }}的信息:{{ message['user_profile']|replace('\n', ' ') }}{% endif %}{%+ else +%}\n[{% if message['role'] == 'user' %}{{ ns.user_name }}{% else %}{{ ns.bot_name }}{% endif %}]{{ message['content']|replace('\n', ' ') }}{% endif %}{% endfor %}{%+ if add_generation_prompt +%}\n[{{ ns.bot_name }}]{% endif %}",
|
| 27 |
"clean_up_tokenization_spaces": true,
|
| 28 |
"do_lower_case": false,
|
| 29 |
"model_max_length": 1000000000000000019884624838656,
|