Flux-Compiled-Graph

Sleeping

App Files Files Community

sayakpaul HF Staff committed on Sep 11

Commit

7d0a465

verified ·

1 Parent(s): 3625a6b

Update fa3.py

Browse files

Files changed (1) hide show

fa3.py +73 -25

fa3.py CHANGED Viewed

@@ -1,14 +1,67 @@
 import torch
 from kernels import get_kernel
 _flash_attn_func = get_kernel("kernels-community/vllm-flash-attn3").flash_attn_func
 @torch.library.custom_op("flash::flash_attn_func", mutates_args=())
-def flash_attn_func(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
-    outputs, lse = _flash_attn_func(q, k, v)
-    return outputs
 @flash_attn_func.register_fake
 def _(q, k, v, **kwargs):
@@ -16,26 +69,26 @@ def _(q, k, v, **kwargs):
     # 1. output: (batch, seq_len, num_heads, head_dim)
     # 2. softmax_lse: (batch, num_heads, seq_len) with dtype=torch.float32
     meta_q = torch.empty_like(q).contiguous()
-    return meta_q #, q.new_empty((q.size(0), q.size(2), q.size(1)), dtype=torch.float32)
-# Copied FusedFluxAttnProcessor2_0 but using flash v3 instead of SDPA
-class FlashFusedFluxAttnProcessor3_0:
     """Attention processor used typically in processing the SD3-like self-attention projections."""
     def __call__(
         self,
         attn,
         hidden_states: torch.FloatTensor,
-        encoder_hidden_states: torch.FloatTensor | None = None,
-        attention_mask: torch.FloatTensor | None = None,
-        image_rotary_emb: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
         # `sample` projections.
-        qkv = attn.to_qkv(hidden_states)
-        split_size = qkv.shape[-1] // 3
-        query, key, value = torch.split(qkv, split_size, dim=-1)
         inner_dim = key.shape[-1]
         head_dim = inner_dim // attn.heads
@@ -52,13 +105,9 @@ class FlashFusedFluxAttnProcessor3_0:
         # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
         # `context` projections.
         if encoder_hidden_states is not None:
-            encoder_qkv = attn.to_added_qkv(encoder_hidden_states)
-            split_size = encoder_qkv.shape[-1] // 3
-            (
-                encoder_hidden_states_query_proj,
-                encoder_hidden_states_key_proj,
-                encoder_hidden_states_value_proj,
-            ) = torch.split(encoder_qkv, split_size, dim=-1)
             encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
                 batch_size, -1, attn.heads, head_dim
@@ -87,10 +136,9 @@ class FlashFusedFluxAttnProcessor3_0:
             key = apply_rotary_emb(key, image_rotary_emb)
         # NB: transposes are necessary to match expected SDPA input shape
-        hidden_states = flash_attn_func(
-            query.transpose(1, 2),
-            key.transpose(1, 2),
-            value.transpose(1, 2))[0].transpose(1, 2)
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)
@@ -109,4 +157,4 @@ class FlashFusedFluxAttnProcessor3_0:
             return hidden_states, encoder_hidden_states
         else:
-            return hidden_states

+"""
+Adapted from
+https://github.com/huggingface/flux-fast/blob/156281514e2725782ffab9431d4004840f7e3b4d/utils/pipeline_utils.py#L87
+"""
+import torch
+from typing import List, Optional
+import inspect
 import torch
 from kernels import get_kernel
 _flash_attn_func = get_kernel("kernels-community/vllm-flash-attn3").flash_attn_func
 @torch.library.custom_op("flash::flash_attn_func", mutates_args=())
+def flash_attn_func(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    softmax_scale: Optional[float] = None,
+    causal: bool = False,
+    # probably wrong type for these 4
+    qv: Optional[float] = None,
+    q_descale: Optional[float] = None,
+    k_descale: Optional[float] = None,
+    v_descale: Optional[float] = None,
+    window_size: Optional[List[int]] = None,
+    sink_token_length: int = 0,
+    softcap: float = 0.0,
+    num_splits: int = 1,
+    # probably wrong type for this too
+    pack_gqa: Optional[float] = None,
+    deterministic: bool = False,
+    sm_margin: int = 0,
+) -> torch.Tensor:  # Tuple[torch.Tensor, torch.Tensor]:
+    if window_size is None:
+        window_size = (-1, -1)
+    else:
+        window_size = tuple(window_size)
+    sig = inspect.signature(_flash_attn_func)
+    accepted = set(sig.parameters)
+    all_kwargs = {
+        "softmax_scale": softmax_scale,
+        "causal": causal,
+        "qv": qv,
+        "q_descale": q_descale,
+        "k_descale": k_descale,
+        "v_descale": v_descale,
+        "window_size": window_size,
+        "sink_token_length": sink_token_length,
+        "softcap": softcap,
+        "num_splits": num_splits,
+        "pack_gqa": pack_gqa,
+        "deterministic": deterministic,
+        "sm_margin": sm_margin,
+    }
+    kwargs = {k: v for k, v in all_kwargs.items() if k in accepted}
+    outputs = _flash_attn_func(q, k, v, **kwargs)
+    return outputs[0]
 @flash_attn_func.register_fake
 def _(q, k, v, **kwargs):
     # Fake implementation registered for `flash_attn_func`: it allocates an output
     # placeholder from `q` without running the attention kernel. `k`, `v` and any
     # keyword arguments are accepted but unused.
     #
     # The two notes below list the kernel's outputs; only the first is modelled here.
     # 1. output: (batch, seq_len, num_heads, head_dim)
     # 2. softmax_lse: (batch, num_heads, seq_len) with dtype=torch.float32
     meta_q = torch.empty_like(q).contiguous()
     # The commented-out tail of the return is the softmax_lse placeholder, left disabled
     # because the op is declared to return a single tensor.
+    return meta_q  # , q.new_empty((q.size(0), q.size(2), q.size(1)), dtype=torch.float32)
+class FlashFluxAttnProcessor3_0:
     """Attention processor used typically in processing the SD3-like self-attention projections."""
     def __call__(
         self,
         attn,
         hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
         batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
         # `sample` projections.
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
         inner_dim = key.shape[-1]
         head_dim = inner_dim // attn.heads
         # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
         # `context` projections.
         if encoder_hidden_states is not None:
+            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
             encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
                 batch_size, -1, attn.heads, head_dim
             key = apply_rotary_emb(key, image_rotary_emb)
         # NB: transposes are necessary to match expected SDPA input shape
+        hidden_states = flash_attn_func(query.transpose(1, 2), key.transpose(1, 2), value.transpose(1, 2))[
+            0
+        ].transpose(1, 2)
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)
             return hidden_states, encoder_hidden_states
         else:
+            return hidden_states