# coding=utf-8 # Copyright 2025 Fromthesky Research Labs, LLC. All rights reserved. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code uses the Llama model implementation by Eleuther AI # and Huggingface teams in this library as a starting point and implements # the PLDR-LLM (Large Language Model from Power Law Decoder Representations) # architecture based on its implementation by the Fromthesky Research Labs team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Callable, Optional, Union import torch from torch import nn import torch.nn.functional as F from transformers.activations import ACT2FN from transformers.cache_utils import Cache, DynamicCache, StaticCache from transformers.generation import GenerationMixin from transformers.masking_utils import create_causal_mask from transformers.modeling_layers import GradientCheckpointingLayer from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from transformers.processing_utils import Unpack from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging from .configuration_pldrllm import PldrllmConfig from dataclasses import dataclass from transformers.utils import ModelOutput logger = logging.get_logger(__name__) ################## PLDRLLM POWER LAW GRAPH ATTENTION IMPLEMENTATION ######################################## '''' Power law attention implementation for PLDR-LLM with KV-cache and G-cache. ''' class PlgaLayer(nn.Module): ''' Power law graph attention layer implementation. ''' def __init__(self, config:PldrllmConfig, F_hidden:int, F_heads:int, layer_idx:int, device=None, **kwargs)->None: ''' Args: F_hidden: hidden layer shape used in layer weight creation. For multi-head plga this is head_dim. F_heads: Number of attention heads. layer_idx: index for the decoder layer. device: device(cpu or gpu) to load tensors. ''' super().__init__(**kwargs) self.F_hidden=F_hidden self.F_heads=F_heads self.layer_idx=layer_idx self.device=device self.config=config self.is_causal = True self.custom_G_type=config.custom_G_type self.attention_dropout=config.attention_dropout # default type is set as config.torch_dtype self.wdtype=None if self.custom_G_type is None: self.build_weights() else: self.Wlst = None self.blst = None self.pwlst = None self.alst = None self.balst = None def cg_align_one(self, Hin:torch.Tensor, Hk:torch.Tensor, Hv:torch.Tensor, A:torch.Tensor, a_vec:Optional[torch.Tensor], ba:Optional[torch.Tensor], W:Optional[torch.Tensor], b:Optional[torch.Tensor], pw:Optional[torch.Tensor], past_G_values: Optional[torch.Tensor], past_G_values_status: Optional[torch.BoolTensor]=None, mask:Optional[torch.Tensor]=None, use_cache: Optional[bool]=None, **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]: ''' Alignment model for calculating attention weights Args: Hin: query Hk: key A: metric tensor instance a_vec: learned coupling coefficients. 
            ba: bias for coupling coefficients
            W: weights applied on metric tensor before AdjActivation
            b: bias applied on metric tensor before AdjActivation
            pw: learned power exponents applied on metric tensor
            mask: padding or lookahead mask
        Returns:
            Hout: Attention output.
            A tuple of:
                A: metric tensor as output of residual metric learner layer, A
                AW: metric tensor after AdjActivation is applied, A_LM
                pw: learned power exponents
                a_vec: learned coupling coefficients for energy-curvature tensor
                ba: bias for energy-curvature tensor
                avAp: Energy curvature tensor, G_LM
                E: attention weights
        '''
        if self.custom_G_type is None and not (use_cache and past_G_values_status[self.layer_idx]):
            AdjActivation=iSwiGLU
            epsilonAdj=1e-9

            # make metric tensor positive definite
            AW=AdjActivation(torch.matmul(W,A)+b)+epsilonAdj

            # find energy curvature tensor and attention weights
            Ap=torch.pow(AW, pw)
            avAp=torch.matmul(a_vec, Ap)+ba  # [batch_size, num_head, depth, depth]

            if use_cache:
                # update only once if cache is enabled.
                G_batch_size=past_G_values.size()[2]
                past_G_values[self.layer_idx]=torch.stack([A[:G_batch_size,:,:,:], AW[:G_batch_size,:,:,:], avAp[:G_batch_size,:,:,:]], dim=0)  # [3, batch_size, num_head, depth, depth]
                past_G_values_status[self.layer_idx]=True
        else:
            AW=past_G_values[self.layer_idx, 1]
            avAp=past_G_values[self.layer_idx, 2]

        WHiWHj = torch.matmul(Hin, avAp)  # [batch_size, num_head, seq_lenq, depth]

        # scale attention with square root of depth
        dk=torch.tensor(self.F_hidden).to(Hin.dtype)
        scaling=1/torch.sqrt(dk)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        query, key, value = WHiWHj.to(dtype=Hk.dtype), Hk, Hv

        Hout, E = attention_interface(
            self,
            query=query,
            key=key,
            value=value,
            attention_mask=mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=scaling,
            **kwargs
        )

        return Hout, (A, AW, pw, a_vec, ba, avAp, E)

    def cg_align_head(self, Hin:torch.Tensor, Hk:torch.Tensor, Hv:torch.Tensor, A:torch.Tensor,
                      mask:Optional[torch.Tensor]=None,
                      past_G_values: Optional[torch.Tensor]=None,
                      past_G_values_status: Optional[torch.BoolTensor]=None,
                      use_cache: Optional[bool]=None,
                      **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
        '''
        Method for linear propagation of attention weights over values.
        '''
        Hout, att_weights=self.cg_align_one(Hin=Hin, Hk=Hk, Hv=Hv, A=A,
                                            a_vec=self.alst, ba=self.balst,
                                            W=self.Wlst, b=self.blst, pw=self.pwlst,
                                            mask=mask,
                                            past_G_values=past_G_values,
                                            past_G_values_status=past_G_values_status,
                                            use_cache=use_cache,
                                            **kwargs)
        return Hout, att_weights

    def build_weights(self)->None:
        '''
        Used to initialize learnable parameters for the layer:
            W: weights to apply on metric tensor.
            b: bias to apply on metric tensor.
            a: coupling coefficients for energy-curvature (G) tensor.
            ba: bias for energy-curvature tensor.
            pw: power exponent weights for potential tensor.
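
        All five parameters are created with shape [num_heads, head_dim, head_dim]
        (see weight_shape below) and initialized in PldrllmPreTrainedModel._init_weights:
        Xavier-uniform for W, a and pw, zeros for b and ba. They enter the attention
        computation in cg_align_one roughly as A_LM = iSwiGLU(W @ A + b) and
        G_LM = a @ A_LM**pw + ba.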
''' weight_shape=[self.F_heads, self.F_hidden, self.F_hidden] # [num_heads, depth, depth] add_weight_Wpart= torch.empty(weight_shape, dtype=self.wdtype, device=self.device) add_weight_bpart=torch.empty(weight_shape, dtype=self.wdtype, device=self.device) add_weight_pwpart=torch.empty(weight_shape, dtype=self.wdtype, device=self.device) add_weight_apart = torch.empty(weight_shape, dtype=self.wdtype, device=self.device) add_weight_bapart=torch.empty(weight_shape, dtype=self.wdtype, device=self.device) self.Wlst = nn.Parameter(add_weight_Wpart, requires_grad=True) self.blst = nn.Parameter(add_weight_bpart, requires_grad=True) self.pwlst = nn.Parameter(add_weight_pwpart, requires_grad=True) self.alst = nn.Parameter(add_weight_apart, requires_grad=True) self.balst = nn.Parameter(add_weight_bapart, requires_grad=True) def forward(self, inputs:tuple[torch.Tensor,...], past_G_values: Optional[torch.Tensor]=None, past_G_values_status: Optional[torch.BoolTensor]=None, use_cache:Optional[bool]=False, **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]: ''' execute the forward propagation inputs[0] = query = Hin inputs[1] = key = Hk inputs[2] = value = Hv inputs[3] = metric tensor = A inputs[4] = mask ''' Hin, Hk, Hv, A, mask=inputs H_next, att_weights = self.cg_align_head(Hin=Hin, Hk=Hk, Hv=Hv, A=A, mask=mask, past_G_values=past_G_values, past_G_values_status=past_G_values_status, use_cache=use_cache, **kwargs) return H_next, att_weights def eager_attention_forward( module: nn.Module, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attention_mask: Optional[torch.Tensor], scaling: float, dropout: float = 0.0, **kwargs:Unpack[TransformersKwargs], )->tuple[torch.Tensor, torch.Tensor]: keyt=torch.permute(key, [0, 1, 3, 2]) # [batch_size, num_head, depth, seq_lenk] attn_weights = torch.matmul(query, keyt) * scaling # [batch_size, num_head, seq_lenq, seq_lenk] if attention_mask is not None: causal_mask = attention_mask[:, :, :, : key.shape[-2]] attn_weights = attn_weights + causal_mask attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = F.dropout(attn_weights, p=dropout, training=module.training) attn_output = torch.matmul(attn_weights, value) attn_output = torch.permute(attn_output, [0, 2, 1, 3]) attn_output = attn_output.contiguous() return attn_output, attn_weights def iSwiGLU(x): '''SwiGLU activation function with weights W,V equal to identity matrix and no bias.''' gate=F.silu(x) out=torch.mul(x, gate) return out ################################### END OF PLDRLLM POWER LAW GRAPH ATTENTION IMPLEMENTATION ############################################ #################################### PLDR-LLM MODEL IMPLEMENTATION ################################################################ ''' Model Implementation for Large Language Model from Power Law Decoder Representations with KV-cache and G-cache. ''' class PldrllmAttention(nn.Module): ''' Power Law Multihead Attention Implementation for PLDR-LLM. 
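
    Rough data flow: q, k, v are produced by the wq/wk/wv projections and split into
    heads; rotary position embeddings are applied to q and k; a metric tensor A is
    obtained from q through the residual metric learner (or taken from the G-cache or
    predefined G values when available); PlgaLayer then combines (q, k, v, A) into the
    attention output, which is merged across heads and projected by `dense`.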
''' def __init__(self,config: PldrllmConfig, layer_idx:int, device=None, **kwargs)->None: super().__init__(**kwargs) self.num_heads = config.num_attention_heads self.d_model = config.hidden_size self.A_dff = config.A_dff self.num_denseA = config.num_denseA self.num_reslayerA = config.num_reslayerA self.activation=ACT2FN[config.hidden_act] self.max_seq_len=config.max_position_embeddings self.layer_idx=layer_idx self.device=device self.attention_bias=config.attention_bias self.custom_G_type=config.custom_G_type self.layer_norm_eps=config.layer_norm_eps self.glu_bias=config.glu_bias self.reference_rope=config.reference_rope self.wdtype=None assert self.d_model % self.num_heads == 0 self.depth = config.head_dim self.wq = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype) self.wk = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype) self.wv = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype) self.plgatt_layer= PlgaLayer(config=config, F_hidden=self.depth, F_heads= self.num_heads, layer_idx=self.layer_idx, device=self.device) self.dense = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype) if self.custom_G_type is None: # residual layers for metric tensor learning self.reslayerAs=nn.ModuleList([ResLayerA(depth=self.depth, A_dff=self.A_dff, num_denseA=self.num_denseA, layer_norm_eps=self.layer_norm_eps, glu_bias=self.glu_bias, activation=self.activation, device=self.device, dtype=self.wdtype) for _ in range(self.num_reslayerA)]) self.layernorm1 = nn.LayerNorm(self.depth, eps=self.layer_norm_eps, device=self.device, dtype=self.wdtype) if self.reference_rope: # keep initialization and forward in same module for reference rope implementation self.rotary_embedding=RotaryPositionalEmbeddings(dim=self.depth, max_seq_len=self.max_seq_len, base=config.rope_theta ).to(device=self.device, dtype=self.wdtype) def split_heads(self, x, batch_size): ''' Split the last dimension into (num_heads, depth). 
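
        The result is a view of shape [batch_size, seq_len, num_heads, depth]; the
        transpose to [batch_size, num_heads, seq_len, depth] is applied later in
        forward, after the rotary embeddings.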
''' x = x.view(batch_size, -1, self.num_heads, self.depth) return x # [batch_size, seq_len, num_heads, depth] def forward(self, inputs:tuple[torch.Tensor, ...], position_embeddings:torch.Tensor, position_ids: Optional[torch.LongTensor]=None, cache_position:Optional[torch.LongTensor]=None, past_G_values: Optional[torch.Tensor]=None, past_G_values_status: Optional[torch.BoolTensor]=None, past_key_values: Optional[Cache]=None, use_cache:Optional[bool]=None, **kwargs: Unpack[TransformersKwargs] )->tuple[torch.Tensor, tuple[torch.Tensor,...]]: q, k, v, mask = inputs batch_size = q.size()[0] q = self.wq(q) # [batch_size, seq_len, d_model] k = self.wk(k) v = self.wv(v) q = self.split_heads(q, batch_size) # [batch_size, seq_len, num_heads, depth] k = self.split_heads(k, batch_size) v = self.split_heads(v, batch_size) if position_embeddings is not None: cos, sin = position_embeddings q, k = apply_rotary_pos_emb(q=q, k=k, cos=cos, sin=sin, unsqueeze_dim=2) else: q=self.rotary_embedding(q, input_pos=position_ids) k=self.rotary_embedding(k, input_pos=position_ids) q = torch.permute(q, [0, 2, 1, 3]) # [batch_size, num_heads, seq_len, depth] k = torch.permute(k, [0, 2, 1, 3]) v = torch.permute(v, [0, 2, 1, 3]) if self.custom_G_type is None and not (use_cache and past_G_values_status[self.layer_idx]): # Calculate density matrix using linear self attention qt = torch.permute(q, [0, 1, 3, 2]) A = torch.matmul(qt, q) # [batch_size, num_head, depth, depth] A=self.layernorm1(A) #Deep residual network for learning metric tensor for i in range(self.num_reslayerA): A=self.reslayerAs[i]([A]) else: A=past_G_values[self.layer_idx,0] # [1, num_head, depth, depth] if use_cache: #cache position for static cache cache_kwargs = {"cache_position": cache_position} k, v = past_key_values.update(key_states=k, value_states=v, layer_idx=self.layer_idx, cache_kwargs=cache_kwargs) #Apply multi-head power law attention Hnext, att_weights = self.plgatt_layer((q, k, v, A, mask), past_G_values, past_G_values_status, use_cache, **kwargs) Hnext= Hnext.reshape(batch_size, -1, self.d_model) # [batch_size, seq_len, d_model] output = self.dense(Hnext) return output, att_weights class PLDR_DecoderLayer(GradientCheckpointingLayer): ''' Single decoder layer implementation for PLDR-LLM with single masked multihead attention. 
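
    The forward pass uses post-layer-norm residual connections:
    out1 = layernorm1(mha1(x) + x) and out2 = layernorm2(ffn(out1) + out1),
    where ffn is a GLUVariant feedforward network (SwiGLU by default).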
''' def __init__(self, config: PldrllmConfig, layer_idx:int, device=None, **kwargs)->None: super().__init__(**kwargs) self.d_model=config.hidden_size self.num_heads=config.num_attention_heads self.dff=config.intermediate_size self.A_dff=config.A_dff self.num_denseA = config.num_denseA self.num_reslayerA = config.num_reslayerA self.activation=ACT2FN[config.hidden_act] self.max_seq_len=config.max_position_embeddings self.layer_idx=layer_idx self.device=device self.layer_norm_eps=config.layer_norm_eps self.glu_bias=config.glu_bias self.wdtype=None self.mha1 = PldrllmAttention(config=config, layer_idx=layer_idx, device=self.device) self.ffn = self.dec_point_wise_feed_forward_network() self.layernorm1 = nn.LayerNorm(self.d_model, eps=self.layer_norm_eps, device=self.device, dtype=self.wdtype) self.layernorm2 = nn.LayerNorm(self.d_model, eps=self.layer_norm_eps, device=self.device, dtype=self.wdtype) def forward(self, hidden_states:torch.Tensor, look_ahead_mask:torch.Tensor, position_embeddings:torch.Tensor, position_ids:Optional[torch.LongTensor]=None, cache_position:Optional[torch.LongTensor]=None, use_cache:Optional[bool]=None, past_key_values:Optional[Cache]=None, past_G_values:Optional[torch.Tensor]=None, past_G_values_status:Optional[list[bool]]=None, **kwargs:Unpack[TransformersKwargs] )->tuple[torch.Tensor, tuple[torch.Tensor,...]]: attn1, att_weights = self.mha1(inputs=[hidden_states, hidden_states, hidden_states, look_ahead_mask], position_embeddings=position_embeddings, position_ids=position_ids, cache_position=cache_position, past_key_values=past_key_values, past_G_values=past_G_values, past_G_values_status=past_G_values_status, use_cache=use_cache, **kwargs ) out1 = self.layernorm1(attn1 + hidden_states) ffn_output = self.ffn(out1) out2 = self.layernorm2(ffn_output + out1) # [batch_size, target_seq_len, d_model] return out2, att_weights # GLUVariant implementation for feedforward network, scale dff accordingly (i.e., 2/3 of original). def dec_point_wise_feed_forward_network(self): return GLUVariant(self.d_model, self.dff, self.d_model, glu_bias=self.glu_bias, activation=self.activation, device=self.device, dtype=self.wdtype) class ResLayerA(nn.Module): ''' Residual Layer implementation for metric learner of PLDR-LLM ''' def __init__(self, depth:int, A_dff:int, num_denseA:int, layer_norm_eps:float, glu_bias:bool, activation:Callable=F.silu, device=None, dtype=None, **kwargs)->None: super().__init__(**kwargs) self.depth=depth self.A_dff = A_dff self.num_denseA = num_denseA self.activation=activation self.device=device self.layer_norm_eps=layer_norm_eps self.glu_bias=glu_bias self.denseAs = nn.ModuleList([GLUVariant(self.depth, self.A_dff, self.depth, glu_bias=self.glu_bias, activation=self.activation, device=self.device, dtype=dtype) for _ in range(self.num_denseA)]) self.layernormA = nn.LayerNorm(self.depth, eps=self.layer_norm_eps, device=self.device, dtype=dtype) self.identity=nn.Identity() def ResUnit(self, A:torch.Tensor)->torch.Tensor: Ain = self.identity(A) for i in range(self.num_denseA): A = self.denseAs[i](A) A = self.layernormA(A + Ain) return A def forward(self, inputs:list[torch.Tensor], **kwargs)->torch.Tensor: A=inputs[0] return self.ResUnit(A) class GLUVariant(nn.Module): ''' Implementation of GLU variants with default activation for SwiGLU configuration For the hidden layer dff, to match size with non-SwiGLU FFN version scaling with 2/3 may be useful. 
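
    The forward pass computes gluw3(activation(gluw1(x)) * gluw2(x)); with the default
    SiLU activation this is the SwiGLU variant, W3(silu(W1 x) * (W2 x)).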
    '''
    def __init__(self, d_model:int, dff:int, depth:int, glu_bias:bool,
                 activation:Callable=F.silu, device=None, dtype=None, **kwargs)->None:
        super().__init__(**kwargs)
        self.dff=dff
        self.depth=depth
        self.d_model=d_model
        self.activation=activation
        self.device=device
        self.glu_bias=glu_bias

        self.gluw1=nn.Linear(self.d_model, self.dff, bias=self.glu_bias, device=self.device, dtype=dtype)
        self.gluw2=nn.Linear(self.d_model, self.dff, bias=self.glu_bias, device=self.device, dtype=dtype)
        self.gluw3=nn.Linear(self.dff, self.depth, bias=self.glu_bias, device=self.device, dtype=dtype)

    def forward(self, input:torch.Tensor, **kwargs)->torch.Tensor:
        x1=self.gluw1(input)
        x1=self.activation(x1)
        x2=self.gluw2(input)
        return self.gluw3(torch.mul(x1, x2))

###################################### END OF PLDRLLM MODEL IMPLEMENTATION #####################################################

# RotaryPositionalEmbeddings is from https://github.com/pytorch/torchtune/blob/main/torchtune/modules/position_embeddings.py
# This implementation was used in the original pytorch based implementation of PLDR-LLM.
class RotaryPositionalEmbeddings(nn.Module):
    """
    This class implements Rotary Positional Embeddings (RoPE)
    proposed in https://arxiv.org/abs/2104.09864.

    Reference implementation (used for correctness verification)
    can be found here:
    https://github.com/meta-llama/llama/blob/main/llama/model.py#L80

    In this implementation we cache the embeddings for each position up to
    ``max_seq_len`` by computing this during init.

    Args:
        dim (int): Embedding dimension. This is usually set to the dim of each
            head in the attention module computed as ``embed_dim // num_heads``
        max_seq_len (int): Maximum expected sequence length for the
            model, if exceeded the cached freqs will be recomputed
        base (int): The base for the geometric progression used to compute
            the rotation angles
    """

    def __init__(
        self,
        dim: int,
        max_seq_len: int = 4096,
        base: int = 10_000,
    ) -> None:
        super().__init__()
        self.dim = dim
        self.base = base
        self.max_seq_len = max_seq_len
        self.rope_init()

    def rope_init(self):
        theta = 1.0 / (
            self.base
            ** (torch.arange(0, self.dim, 2)[: (self.dim // 2)].float() / self.dim)
        )
        self.register_buffer("theta", theta, persistent=False)
        self.build_rope_cache(self.max_seq_len)

    def build_rope_cache(self, max_seq_len: int = 4096) -> None:
        # Create position indexes `[0, 1, ..., max_seq_len - 1]`
        seq_idx = torch.arange(
            max_seq_len, dtype=self.theta.dtype, device=self.theta.device
        )

        # Outer product of theta and position index; output tensor has
        # a shape of [max_seq_len, dim // 2]
        idx_theta = torch.einsum("i, j -> ij", seq_idx, self.theta).float()

        # cache includes both the cos and sin components and so the output shape is
        # [max_seq_len, dim // 2, 2]
        cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
        self.register_buffer("cache", cache, persistent=False)

    def forward(
        self, x: torch.Tensor, *, input_pos: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): input tensor with shape
                ``[b, s, n_h, h_d]``
            input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
                of each token. During training, this is used to indicate the positions
                of each token relative to its sample when packed, shape [b, s].
                During inference, this indicates the position of the current token.
                If none, assume the index of the token is its position id. Default is None.
Returns: torch.Tensor: output tensor with shape ``[b, s, n_h, h_d]`` Notation used for tensor shapes: - b: batch size - s: sequence length - n_h: num heads - h_d: head dim """ # input tensor has shape [b, s, n_h, h_d] seq_len = x.size(1) # extract the values based on whether input_pos is set or not rope_cache = ( self.cache[:seq_len] if input_pos is None else self.cache[input_pos] ) # reshape input; the last dimension is used for computing the output. # Cast to float to match the reference implementation # tensor has shape [b, s, n_h, h_d // 2, 2] xshaped = x.float().reshape(*x.shape[:-1], -1, 2) # reshape the cache for broadcasting # tensor has shape [b, s, 1, h_d // 2, 2] if packed samples, # otherwise has shape [1, s, 1, h_d // 2, 2] rope_cache = rope_cache.view(-1, xshaped.size(1), 1, xshaped.size(3), 2) # tensor has shape [b, s, n_h, h_d // 2, 2] x_out = torch.stack( [ xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], ], -1, ) # tensor has shape [b, s, n_h, h_d] x_out = x_out.flatten(3) return x_out.type_as(x) class PldrllmRotaryEmbedding(nn.Module): def __init__(self, config: PldrllmConfig, device=None): super().__init__() # BC: "rope_type" was originally "type" if hasattr(config, "rope_scaling") and config.rope_scaling is not None: self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) else: self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) self.register_buffer("inv_freq", inv_freq, persistent=False) self.original_inv_freq = self.inv_freq @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) def forward(self, x, position_ids): inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) position_ids_expanded = position_ids[:, None, :].float() device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): # Force float32 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. Args: q (`torch.Tensor`): The query tensor. k (`torch.Tensor`): The key tensor. cos (`torch.Tensor`): The cosine part of the rotary embedding. sin (`torch.Tensor`): The sine part of the rotary embedding. position_ids (`torch.Tensor`, *optional*): Deprecated and unused. unsqueeze_dim (`int`, *optional*, defaults to 1): The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. Returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. """ cos = cos.unsqueeze(unsqueeze_dim) sin = sin.unsqueeze(unsqueeze_dim) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed ############# END OF ROTARY EMBEDDING IMPLEMENTATION ################################################# @dataclass class BasePLDRModelOutputWithPast(ModelOutput): """ Base class for [`PldrllmModel`] outputs that may also contain a past key/values (to speed up sequential decoding). Args: last_hidden_state (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output. past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. pldr_attentions (`tuple(tuple(torch.Tensor)))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`): Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module. The tuple for each layer contains: output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`, output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`, learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`, attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`. 
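            Note that when `use_cache=True` and the G-cache for a layer has already been
            populated, the cached A, A_LM and G_LM tensors are returned for that layer
            instead of freshly computed ones (see `PlgaLayer.cg_align_one`).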
""" last_hidden_state: Optional[torch.Tensor] = None past_key_values: Optional[Cache] = None hidden_states: Optional[tuple[torch.Tensor, ...]] = None attentions: Optional[tuple[torch.Tensor, ...]] = None pldr_attentions:Optional[tuple[tuple[torch.Tensor, ...]]] = None @dataclass class CausalPLDRLLMOutputWithPast(ModelOutput): """ Base class for [`PldrllmForCausalLM`] causal language model (or autoregressive) outputs. Args: loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Language modeling loss (for next-token prediction). logits (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. pldr_attentions (`tuple(tuple(torch.Tensor)))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`): Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module. The tuple for each layer contains: output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`, output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`, learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`, attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`. """ loss: Optional[torch.Tensor] = None logits: Optional[torch.Tensor] = None past_key_values: Optional[Cache] = None hidden_states: Optional[tuple[torch.Tensor, ...]] = None attentions: Optional[tuple[torch.Tensor, ...]] = None pldr_attentions:Optional[tuple[tuple[torch.Tensor, ...]]] = None @dataclass class TokenClassifierPLDRLLMOutput(ModelOutput): """ Base class for outputs of [`PldrllmForTokenClassification`] token classification model. Args: loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided) : Classification loss. 
logits (`torch.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. pldr_attentions (`tuple(tuple(torch.Tensor)))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`): Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module. The tuple for each layer contains: output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`, output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`, learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`, attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`. """ loss: Optional[torch.Tensor] = None logits: Optional[torch.Tensor] = None hidden_states: Optional[tuple[torch.Tensor, ...]] = None attentions: Optional[tuple[torch.Tensor, ...]] = None pldr_attentions:Optional[tuple[tuple[torch.Tensor, ...]]] = None @dataclass class QuestionAnsweringPLDRModelOutput(ModelOutput): """ Base class for outputs of [`PldrllmForQuestionAnswering`] question answering model. Args: loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_logits (`torch.Tensor` of shape `(batch_size, sequence_length)`): Span-start scores (before SoftMax). end_logits (`torch.Tensor` of shape `(batch_size, sequence_length)`): Span-end scores (before SoftMax). hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. 
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. pldr_attentions (`tuple(tuple(torch.Tensor)))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`): Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module. The tuple for each layer contains: output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`, output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`, learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`, attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`. """ loss: Optional[torch.Tensor] = None start_logits: Optional[torch.Tensor] = None end_logits: Optional[torch.Tensor] = None hidden_states: Optional[tuple[torch.Tensor, ...]] = None attentions: Optional[tuple[torch.Tensor, ...]] = None pldr_attentions:Optional[tuple[tuple[torch.Tensor, ...]]] = None @dataclass class SequenceClassifierPLDRLLMOutputWithPast(ModelOutput): """ Base class for outputs of [`PldrllmForSequenceClassification`] sentence classification model. Args: loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Classification (or regression if config.num_labels==1) loss. logits (`torch.Tensor` of shape `(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. pldr_attentions (`tuple(tuple(torch.Tensor)))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`): Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module. 
The tuple for each layer contains: output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`, output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`, learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`, energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`, attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`. """ loss: Optional[torch.Tensor] = None logits: Optional[torch.Tensor] = None past_key_values: Optional[Cache] = None hidden_states: Optional[tuple[torch.Tensor, ...]] = None attentions: Optional[tuple[torch.Tensor, ...]] = None pldr_attentions:Optional[tuple[tuple[torch.Tensor, ...]]] = None @auto_docstring class PldrllmPreTrainedModel(PreTrainedModel): config_class = PldrllmConfig base_model_prefix = "decoder" supports_gradient_checkpointing = True _no_split_modules = ["PLDR_DecoderLayer"] _skip_keys_device_placement = ["past_key_values"] _supports_flash_attn = True _supports_sdpa = True _supports_flex_attn = False _supports_attention_backend = True _can_compile_fullgraph=False def __init__(self, config: PldrllmConfig)->None: super().__init__(config) self.custom_G_type=config.custom_G_type if self.custom_G_type is not None: self._can_compile_fullgraph=True def _init_weights(self, module): if isinstance(module, nn.Linear): nn.init.xavier_uniform_(module.weight.data) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=1.0) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.weight.data.fill_(1.0) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, PlgaLayer): if module.Wlst is not None: nn.init.xavier_uniform_(module.Wlst.data) if module.pwlst is not None: nn.init.xavier_uniform_(module.pwlst.data) if module.alst is not None: nn.init.xavier_uniform_(module.alst.data) if module.blst is not None: module.blst.data.zero_() if module.balst is not None: module.balst.data.zero_() MODEL_COMMON_CUSTOM_ARGS=r""" output_pldr_attentions (`bool`, *optional*, defaults to `False`): Whether to return the deductive outputs and learnable parameters of power law graph attention module as tuple containing: the output of the residual metric learner (metric tensor, A), output (A_LM) after application of iSwiGLU on metric tensor, learned exponents of potential tensor, learned weights for energy-curvature tensor, learned bias for energy-curvature tensor, energy-curvature tensor (G_LM), and attention weights. cache_first_G (`bool`, *optional*, defaults to `False`): Whether or not the model should return the G values from first sample in a batch or G values from all samples for past_G_values initialization. When `cache_first_G=true`, the batch_size of past_G_values is 1. This argument should be set to True for contrastive text generation with learned G values. """ @auto_docstring(custom_intro=""" Large Language Model From Power Law Decoder Representations (PLDR-LLM) with decoder hidden state as output. PLDR-LLM is a model architecture that utilizes Power Law Graph Attention (PLGA) in decoder layers. 
For details of model architecture, check out these papers: [Paper-1](https://huggingface.co/papers/2107.02039) [Paper-2](https://huggingface.co/papers/2410.16703) [Paper-3](https://huggingface.co/papers/2502.13502) """ ) class PldrllmModel(PldrllmPreTrainedModel): def __init__(self, config: PldrllmConfig)->None: super().__init__(config) # Initialize weights and apply final processing self.num_layers = config.num_hidden_layers self.d_model=config.hidden_size self.num_heads=config.num_attention_heads self.target_vocab_size =config.vocab_size self.max_seq_len=config.max_position_embeddings self.reference_rope=config.reference_rope self.pldr_device=None self.gradient_checkpointing = False self.layer_norm_eps=config.layer_norm_eps self.wdtype=None assert self.d_model % self.num_heads == 0 self.depth = config.head_dim self.custom_G_type=config.custom_G_type if self.custom_G_type is not None: # predefined past_G_values are initialized for both training and inference past_G_values, past_G_values_status=self.G_values_init(device=self.pldr_device, dtype=self.wdtype) self.register_buffer("past_G_values_status", past_G_values_status, persistent=True) self.register_buffer("past_G_values", past_G_values, persistent=True) logger.warning("\nIMPORTANT: decoder.past_G_values are set to predefined values and deep PLGA layers will be skipped. " "Set config.custom_G_type=None to enable deep PLGA layers.") if self.custom_G_type=="external": logger.warning("\nIMPORTANT: config.custom_G_type is selected as 'external' and an external value of decoder.past_G_values[:,2,...] is expected. " "decoder.past_G_values[:,2,...] are initialized to identity tensor by default. This is equivalent to an LLM with SDPA. To provide external values " "to the decoder.past_G_values, either load these values along with the pretrained model or set decoder.past_G_values to a torch.float tensor of " "size (num_layers, 3, 1, num_heads, head_dim, head_dim) after model is initialized.\n") else: # learned past_G_values is initialized at inference. 
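            # A hedged sketch of the learned G-cache layout (shapes taken from
            # G_values_init below): when use_cache=True, forward() lazily creates
            # past_G_values with shape
            #   [num_layers, 3, batch_size, num_heads, head_dim, head_dim]
            # holding A, A_LM and G_LM per layer, while past_G_values_status marks which
            # layers have been filled so the deep PLGA stack runs only once per
            # generation. For example, after a cached forward pass one could inspect
            #   model.decoder.past_G_values[0, 2]   # G_LM of the first decoder layer
            # where `model` is a hypothetical PldrllmForCausalLM instance.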
self.register_buffer("past_G_values_status", None, persistent=False) self.register_buffer("past_G_values", None, persistent=False) self.is_past_G_values_initialized=False self.embedding = nn.Embedding(self.target_vocab_size, self.d_model, device=self.pldr_device, dtype=self.wdtype) self.dec_layers = nn.ModuleList([PLDR_DecoderLayer(config, layer_idx=i, device=self.pldr_device) for i in range(self.num_layers)]) self.layernorm1 = nn.LayerNorm(self.d_model, eps=self.layer_norm_eps, device=self.pldr_device, dtype=self.wdtype) if not self.reference_rope: self.rotary_embedding=PldrllmRotaryEmbedding(config=config) self.post_init() def G_values_init(self, batch_size=1, device=None, dtype=None): G_values_dim=(self.num_layers, 1, self.num_heads, self.depth, self.depth) # [num_layers, 1, num_heads, depth, depth] zeros_tensor=torch.zeros(G_values_dim, device=device, dtype=dtype) identity_tensor=torch.eye(self.depth).repeat(self.num_layers, 1, self.num_heads, 1, 1).to(device=device, dtype=dtype) random_tensor=torch.randn(G_values_dim, device=device, dtype=dtype) CUSTOM_G_VALUES={ 'identity':torch.stack([zeros_tensor, zeros_tensor, identity_tensor], dim=1), # [num_layers, 3, num_heads, depth, depth] 'random': torch.stack([zeros_tensor, zeros_tensor, random_tensor], dim=1), 'external': torch.stack([zeros_tensor, zeros_tensor, identity_tensor], dim=1) } if self.custom_G_type is None: # 3 tensors for A, AW and avAp per layer past_G_values = torch.zeros((self.num_layers, 3, batch_size, self.num_heads, self.depth, self.depth), device=device, dtype=dtype) past_G_values_status=torch.tensor([False]*self.num_layers, dtype=torch.bool, device=device) elif self.custom_G_type in ['identity', 'random', 'external']: past_G_values=CUSTOM_G_VALUES[self.custom_G_type] past_G_values_status=torch.tensor([True]*self.num_layers, dtype=torch.bool, device=device) else: raise ValueError("Invalid custom_G_type value. Available values are " "None, 'identity', 'random', and 'external'.") self.is_past_G_values_initialized=True return past_G_values, past_G_values_status @can_return_tuple @auto_docstring( custom_args=MODEL_COMMON_CUSTOM_ARGS ) def forward(self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache]=None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_pldr_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, cache_first_G: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs] ): use_cache=use_cache if use_cache is not None else self.config.use_cache cache_first_G=cache_first_G if cache_first_G is not None else self.config.cache_first_G output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_pldr_attentions=output_pldr_attentions if output_pldr_attentions is not None else self.config.output_pldr_attentions output_hidden_states=output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states if (self.gradient_checkpointing or self.training) and use_cache: logger.warning_once( "During training, setting `use_cache=False`. Additionally, `use_cache=True` is incompatible with gradient checkpointing." 
) use_cache = False if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") inputs_embeds = self.embedding(input_ids) if inputs_embeds is None else inputs_embeds # [batch_size, target_seq_len, d_model] dec_att_weights=() if output_pldr_attentions else None dec_attentions=() if output_attentions else None dec_outputs=(inputs_embeds,) if output_hidden_states else None if not isinstance(past_key_values, (type(None), Cache)): raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.") if use_cache and past_key_values is None: past_key_values = DynamicCache() # reset past_G_Values_status if they are not custom and predefined. if use_cache and self.custom_G_type is None and not isinstance(past_key_values, StaticCache) and past_key_values.get_seq_length()==0: self.past_G_values_status=torch.tensor([False]*self.num_layers, dtype=torch.bool, device=inputs_embeds.device) self.is_past_G_values_initialized=False if use_cache and isinstance(past_key_values, StaticCache) and ((self.custom_G_type is None) or "flash_attention" in self.config._attn_implementation): raise ValueError("Static Cache is only supported with predefined past_G_values. " "Flash attention is not supported. " "Supported models are with config.custom_G_type set to 'random', 'identity' or 'external'.") if not self.is_past_G_values_initialized and self.custom_G_type is None: if use_cache: batch_size=1 if cache_first_G else inputs_embeds.size()[0] self.past_G_values, self.past_G_values_status=self.G_values_init(batch_size=batch_size, device=inputs_embeds.device, dtype=inputs_embeds.dtype) else: self.past_G_values_status=torch.tensor([False]*self.num_layers, dtype=torch.bool, device=inputs_embeds.device) self.past_G_values=None self.is_past_G_values_initialized=True if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 cache_position = torch.arange( past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) if position_ids is None: position_ids = cache_position.unsqueeze(0) causal_mask = create_causal_mask( config=self.config, input_embeds=inputs_embeds, attention_mask=attention_mask, cache_position=cache_position, past_key_values=past_key_values, position_ids=position_ids ) hidden_states=inputs_embeds # create position embeddings to be shared across the decoder layers if not self.reference_rope: position_embeddings = self.rotary_embedding(hidden_states, position_ids) else: # defer reference rope initialization in the PldrllmAttention module. 
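            # With config.reference_rope=True no shared (cos, sin) pair is built here;
            # each PldrllmAttention layer applies its own RotaryPositionalEmbeddings
            # using position_ids instead.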
position_embeddings=None hidden_states *= torch.sqrt(torch.tensor(self.d_model).to(dtype=hidden_states.dtype)) hidden_states=self.layernorm1(hidden_states) for i in range(self.num_layers): hidden_states, dec_att_w= self.dec_layers[i](hidden_states, causal_mask, position_embeddings=position_embeddings, position_ids=position_ids, cache_position=cache_position, use_cache=use_cache, past_key_values=past_key_values, past_G_values=self.past_G_values, past_G_values_status=self.past_G_values_status, **kwargs ) if output_pldr_attentions: dec_att_weights += (dec_att_w,) if output_attentions: dec_attentions += (dec_att_w[-1],) if output_hidden_states: dec_outputs += (hidden_states,) last_hidden_state=hidden_states return BasePLDRModelOutputWithPast( last_hidden_state = last_hidden_state, past_key_values=past_key_values if use_cache else None, hidden_states=dec_outputs, attentions=dec_attentions, pldr_attentions=dec_att_weights ) def get_input_embeddings(self): return self.embedding def set_input_embeddings(self, value): self.embedding = value @auto_docstring(custom_intro=""" Large Language Model From Power Law Decoder Representations (PLDR-LLM) with LM Head as final layer. PLDR-LLM is a model architecture that utilizes Power Law Graph Attention (PLGA) in decoder layers. For details of model architecture, check out these papers: [Paper-1](https://huggingface.co/papers/2107.02039) [Paper-2](https://huggingface.co/papers/2410.16703) [Paper-3](https://huggingface.co/papers/2502.13502) """ ) class PldrllmForCausalLM(PldrllmPreTrainedModel, GenerationMixin): def __init__(self, config: PldrllmConfig)->None: super().__init__(config) self.d_model=config.hidden_size self.input_vocab_size =config.vocab_size self.final_bias=config.final_bias self.pldr_device=None self.decoder=PldrllmModel(config=config) self.wdtype=None self.final_layer = nn.Linear(self.d_model, self.input_vocab_size, bias=self.final_bias, device=self.pldr_device, dtype=self.wdtype) self.post_init() def get_input_embeddings(self): return self.decoder.embedding def set_input_embeddings(self, value): self.decoder.embedding = value def get_output_embeddings(self): return self.final_layer def set_output_embeddings(self, new_embeddings): self.final_layer = new_embeddings def set_decoder(self, decoder): self.decoder = decoder def get_decoder(self): return self.decoder @can_return_tuple @auto_docstring( custom_args=MODEL_COMMON_CUSTOM_ARGS ) def forward(self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache]=None, use_cache: Optional[bool] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_pldr_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, cache_first_G: Optional[bool] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **kwargs: Unpack[TransformersKwargs], )-> CausalPLDRLLMOutputWithPast: outputs: BasePLDRModelOutputWithPast=self.decoder(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_pldr_attentions=output_pldr_attentions, output_hidden_states=output_hidden_states, cache_position=cache_position, cache_first_G=cache_first_G, **kwargs ) hidden_states = outputs.last_hidden_state # Only compute 
necessary logits, and do not upcast them to float if we are not computing the loss slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep logits = self.final_layer(hidden_states[:, slice_indices, :]) loss = None if labels is not None: loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) return CausalPLDRLLMOutputWithPast( loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions= outputs.attentions, #list of E pldr_attentions=outputs.pldr_attentions ) @auto_docstring class PldrllmForTokenClassification(PldrllmPreTrainedModel): def __init__(self, config:PldrllmConfig)->None: super().__init__(config) self.num_labels = config.num_labels self.decoder = PldrllmModel(config) self.wdtype=None if getattr(config, "classifier_dropout", None) is not None: classifier_dropout = config.classifier_dropout elif getattr(config, "hidden_dropout", None) is not None: classifier_dropout = config.hidden_dropout else: classifier_dropout = 0.1 self.dropout = nn.Dropout(classifier_dropout) self.score = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=self.wdtype) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.decoder.embedding def set_input_embeddings(self, value): self.decoder.embedding = value @can_return_tuple @auto_docstring( custom_args=MODEL_COMMON_CUSTOM_ARGS ) def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_pldr_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, cache_first_G: Optional[bool] = None, ) -> TokenClassifierPLDRLLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" outputs: BasePLDRModelOutputWithPast = self.decoder( input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, output_pldr_attentions=output_pldr_attentions, cache_first_G=cache_first_G ) sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.score(sequence_output) loss = None if labels is not None: loss = self.loss_function(logits, labels, self.config) return TokenClassifierPLDRLLMOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, pldr_attentions=outputs.pldr_attentions ) @auto_docstring class PldrllmForQuestionAnswering(PldrllmPreTrainedModel): # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Llama->Pldrllm def __init__(self, config:PldrllmConfig): super().__init__(config) self.decoder = PldrllmModel(config) self.wdtype=None self.qa_outputs = nn.Linear(config.hidden_size, 2, bias=True, dtype=self.wdtype) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.decoder.embedding def set_input_embeddings(self, value): self.decoder.embedding = value @can_return_tuple @auto_docstring( custom_args=MODEL_COMMON_CUSTOM_ARGS ) def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, start_positions: Optional[torch.LongTensor] = None, end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_pldr_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, cache_first_G: Optional[bool] = None, **kwargs, ) -> QuestionAnsweringPLDRModelOutput: outputs: BasePLDRModelOutputWithPast = self.decoder( input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, output_pldr_attentions=output_pldr_attentions, cache_first_G=cache_first_G ) sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() end_logits = end_logits.squeeze(-1).contiguous() loss = None if start_positions is not None and end_positions is not None: loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs) return QuestionAnsweringPLDRModelOutput( loss=loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, pldr_attentions=outputs.pldr_attentions ) @auto_docstring( custom_intro=""" The PLDR-LLM with a sequence classification head on top (linear layer). [`PldrllmForSequenceClassification`] uses the last token in order to do the classification, as other causal models (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. 
Since it cannot guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in each row of the batch). """ ) class PldrllmForSequenceClassification(PldrllmPreTrainedModel): def __init__(self, config:PldrllmConfig)->None: super().__init__(config) self.num_labels = config.num_labels self.decoder = PldrllmModel(config) self.wdtype=None self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False, dtype=self.wdtype) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.decoder.embedding def set_input_embeddings(self, value): self.decoder.embedding = value @can_return_tuple @auto_docstring( custom_args=MODEL_COMMON_CUSTOM_ARGS ) def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_pldr_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, cache_first_G: Optional[bool] = None ) -> SequenceClassifierPLDRLLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ outputs: BasePLDRModelOutputWithPast = self.decoder( input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_pldr_attentions=output_pldr_attentions, output_hidden_states=output_hidden_states, cache_first_G=cache_first_G ) hidden_states = outputs.last_hidden_state logits = self.score(hidden_states) if input_ids is not None: batch_size = input_ids.shape[0] else: batch_size = inputs_embeds.shape[0] if self.config.pad_token_id is None and batch_size != 1: raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") if self.config.pad_token_id is None: last_non_pad_token = -1 elif input_ids is not None: # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32) token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32) last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) else: last_non_pad_token = -1 logger.warning_once( f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. 
Results may be " "unexpected if using padding tokens in conjunction with `inputs_embeds.`" ) pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token] loss = None if labels is not None: loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) return SequenceClassifierPLDRLLMOutputWithPast( loss=loss, logits=pooled_logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, pldr_attentions=outputs.pldr_attentions ) __all__ = [ "PldrllmForCausalLM", "PldrllmModel", "PldrllmPreTrainedModel", "PldrllmForTokenClassification", "PldrllmForQuestionAnswering", "PldrllmForSequenceClassification" ]
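

# -----------------------------------------------------------------------------------
# Minimal usage sketch (kept as comments so that importing this module stays free of
# side effects). The default PldrllmConfig() below is an assumption standing in for a
# real checkpoint; with a pretrained PLDR-LLM checkpoint one would instead call
# PldrllmForCausalLM.from_pretrained(...).
#
#   config = PldrllmConfig()
#   model = PldrllmForCausalLM(config).eval()
#   input_ids = torch.randint(0, config.vocab_size, (1, 8))
#   with torch.no_grad():
#       out = model(input_ids=input_ids, use_cache=False, output_pldr_attentions=True)
#   out.logits.shape          # (1, 8, config.vocab_size)
#   len(out.pldr_attentions)  # one tuple of PLGA tensors per decoder layer
# -----------------------------------------------------------------------------------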