Commit 0f3418e
Parent(s): 10aca20

Got model running, but results are incorrect

Files changed:
- attention.py +3 -6
- config.json +2 -2
- phi2_configuration.py +18 -18
- phi2_model.py +1 -1
attention.py CHANGED

@@ -28,7 +28,7 @@ class RotaryEmbedding(nn.Module):
         d_rotary: int,
         rotary_base: float = 10000.0,
         initial_cos_sin_cache_len: int = 2048,
-        device: torch.device
+        device: torch.device = "cuda",
     ) -> None:
         super().__init__()
         self.d_rotary = d_rotary

@@ -52,7 +52,6 @@ class RotaryEmbedding(nn.Module):
             torch.arange(
                 start=0,
                 end=self.d_rotary,
-                step=2,
                 device=self.device,
                 dtype=self.dtype,
             ) / self.d_rotary

@@ -61,8 +60,8 @@ class RotaryEmbedding(nn.Module):
         # torch.outer, since torch.einsum converts from fp32 to fp16 if used with torch.amp
         # TODO: does this matter if I'm disabling torch.autocast?
         m_theta_i = torch.outer(m, theta_i)
-        self._cos_cached = torch.cos(m_theta_i).to(self.dtype)
-        self._sin_cached = torch.sin(m_theta_i).to(self.dtype)
+        self._cos_cached = torch.cos(m_theta_i).to(self.dtype).to(self.device)
+        self._sin_cached = torch.sin(m_theta_i).to(self.dtype).to(self.device)

         # TODO: scale_base caching is labelled as not yet done in Phi2
         """

@@ -108,8 +107,6 @@ class RotaryEmbedding(nn.Module):
         if (
             not self._max_seqlen
             or self._max_seqlen < x.shape[1] + seqlen_offset
-            or self._cos_cached.device != x.device
-            or self._cos_cached.dtype != x.dtype
             or (self.training and self._cos_cached.is_inference())
         ):
             self._update_cos_sin_cache(seqlen=x.shape[1] + seqlen_offset)
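For reference on the step=2 removal above: the conventional rotary-embedding cache computes theta_i = rotary_base^(-2i/d_rotary), i.e. it walks the rotary dimension with torch.arange(start=0, end=d_rotary, step=2) and yields d_rotary/2 frequencies. The sketch below is a generic reconstruction of that standard cache build, not this repository's code; the names d_rotary and rotary_base are borrowed from the diff, the function name is made up for illustration.

import torch

def build_cos_sin_cache(
    seq_len: int,
    d_rotary: int,
    rotary_base: float = 10000.0,
    device: str = "cpu",
    dtype: torch.dtype = torch.float32,
):
    # theta_i = rotary_base ** (-2i / d_rotary): one frequency per pair of rotary dims
    theta_i = 1.0 / rotary_base ** (
        torch.arange(start=0, end=d_rotary, step=2, device=device, dtype=dtype) / d_rotary
    )
    m = torch.arange(seq_len, device=device, dtype=dtype)  # token positions 0 .. seq_len-1
    m_theta_i = torch.outer(m, theta_i)                    # shape (seq_len, d_rotary // 2)
    return torch.cos(m_theta_i).to(dtype), torch.sin(m_theta_i).to(dtype)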
config.json CHANGED

@@ -17,8 +17,8 @@
     "vocab_chunk_for_gpu_efficiency": 64,
     "initial_cos_sin_cache_len": 2048,
     "d_embedding": 2560,
-    "
-    "
+    "n_attn_blocks": 32,
+    "n_attn_heads": 32,
     "use_flash_attn": false,
     "use_flash_rotary": false,
     "use_fused_dense": false,
phi2_configuration.py CHANGED

@@ -8,27 +8,27 @@ class Phi2Config(PretrainedConfig):
         "max_position_embeddings": "initial_cos_sin_cache_len",
         "hidden_size": "d_embedding",
         "num_attention_heads": "n_attn_heads",
-        "num_hidden_layers": "
+        "num_hidden_layers": "n_attn_blocks",
     }

     def __init__(
         self,
-        vocab_size: int
-        vocab_chunk_for_gpu_efficiency: int
-        initial_cos_sin_cache_len: int
-        d_embedding: int
-
-        n_attn_heads: int
-        use_flash_attn: bool
-        use_flash_rotary: bool
-        use_fused_dense: bool
-        attn_pdrop: float
-        embd_pdrop: float
-        resid_pdrop: float
-        layer_norm_epsilon: float
-        weight_initialization_range: float
-        tie_word_embeddings: bool
-        checkpointing: bool
+        vocab_size: int,  # this includes the extra tokens included by Phi2 in tokenizer_config.json
+        vocab_chunk_for_gpu_efficiency: int,
+        initial_cos_sin_cache_len: int,
+        d_embedding: int,
+        n_attn_blocks: int,
+        n_attn_heads: int,
+        use_flash_attn: bool,
+        use_flash_rotary: bool,
+        use_fused_dense: bool,
+        attn_pdrop: float,
+        embd_pdrop: float,
+        resid_pdrop: float,
+        layer_norm_epsilon: float,
+        weight_initialization_range: float,
+        tie_word_embeddings: bool,  # whether embedding weights are shared between the encoder and decoder
+        checkpointing: bool,  # whether to use gradient checkpointing to reduce memory usage (I think)
         **kwargs
     ) -> None:
         self.vocab_size = (

@@ -38,7 +38,7 @@ class Phi2Config(PretrainedConfig):
         )
         self.initial_cos_sin_cache_len = initial_cos_sin_cache_len
         self.d_embedding = d_embedding
-        self.
+        self.n_attn_blocks = n_attn_blocks
         self.n_attn_heads = n_attn_heads
         self.use_flash_attn = use_flash_attn
         self.use_flash_rotary = use_flash_rotary
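The renamed mapping at the top of Phi2Config looks like transformers' PretrainedConfig attribute_map (an assumption — the variable name isn't visible in this hunk), which lets generic Hugging Face code read standard names like num_hidden_layers while the class stores the value under the custom field. A minimal hedged sketch of that mechanism, using an illustrative stand-in class rather than the repository's Phi2Config:

from transformers import PretrainedConfig

class TinyConfig(PretrainedConfig):  # illustrative stand-in, not the repo's Phi2Config
    attribute_map = {
        "num_hidden_layers": "n_attn_blocks",
        "num_attention_heads": "n_attn_heads",
        "hidden_size": "d_embedding",
    }

    def __init__(
        self,
        n_attn_blocks: int = 32,
        n_attn_heads: int = 32,
        d_embedding: int = 2560,
        **kwargs,
    ) -> None:
        self.n_attn_blocks = n_attn_blocks
        self.n_attn_heads = n_attn_heads
        self.d_embedding = d_embedding
        super().__init__(**kwargs)

cfg = TinyConfig()
# attribute_map redirects the standard HF name to the renamed field
assert cfg.num_hidden_layers == cfg.n_attn_blocks == 32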
phi2_model.py CHANGED

@@ -106,7 +106,7 @@ class Phi2Model(Phi2PreTrainedModel):
                 use_fused_dense=config.use_fused_dense,
                 checkpointing=config.checkpointing,
             )
-            for i in range(config.
+            for i in range(config.n_attn_blocks)
         ])
         self.gradient_checkpointing_disable()  # https://github.com/cybertronai/gradient-checkpointing - I think this is turned off due to flash attention?
         self.post_init()  # calls self._init_weights() for all modules
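The phi2_model.py hunk completes the list comprehension so the number of transformer blocks is driven by config.n_attn_blocks. A minimal hedged sketch of that pattern; TinyBlock and TinyModel are illustrative stand-ins, not the repository's classes:

import torch
import torch.nn as nn

class TinyBlock(nn.Module):  # stand-in for the repo's attention block
    def __init__(self, d_embedding: int) -> None:
        super().__init__()
        self.ln = nn.LayerNorm(d_embedding)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.ln(x)

class TinyModel(nn.Module):
    def __init__(self, n_attn_blocks: int, d_embedding: int) -> None:
        super().__init__()
        # one block per n_attn_blocks, mirroring the comprehension in the diff
        self.blocks = nn.ModuleList([TinyBlock(d_embedding) for _ in range(n_attn_blocks)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for block in self.blocks:
            x = block(x)
        return x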