Spaces:

1inkusFace
/

StableDiffusion-3.5-Large-lora

Paused

App Files Files Community

1inkusFace committed on Oct 13

Commit

413e290

verified ·

1 Parent(s): 58cc351

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -10

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import subprocess
 subprocess.run(['sh', './spaces.sh'])
 import os
-# Environment variable setup
 os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
 os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
 os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,pinned_use_background_threads:True'
@@ -17,7 +17,6 @@ import datetime
 import threading
 import io
-# --- New GCS Imports ---
 from google.oauth2 import service_account
 from google.cloud import storage
@@ -27,9 +26,8 @@ import torch
 def install_flashattn():
     """Run the ``./flashattn.sh`` script through ``sh``; the result is discarded."""
     command = ['sh', './flashattn.sh']
     subprocess.run(command)
-#install_flashattn()
-# Torch performance settings
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
 torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
@@ -45,13 +43,10 @@ from PIL import Image
 from image_gen_aux import UpscaleWithModel
-# --- GCS Configuration ---
-# Make sure to set these secrets in your Hugging Face Space settings
 GCS_BUCKET_NAME = os.getenv("GCS_BUCKET_NAME")
 GCS_SA_KEY = os.getenv("GCS_SA_KEY") # The full JSON key content as a string
-# Initialize GCS client if credentials are available
 gcs_client = None
 if GCS_SA_KEY and GCS_BUCKET_NAME:
     try:
         credentials_info = eval(GCS_SA_KEY) # Using eval is safe here if you trust the secret source
@@ -79,6 +74,50 @@ def upload_to_gcs(image_object, filename):
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 @spaces.GPU(duration=120)
 def compile_transformer():
     with spaces.aoti_capture(pipe.transformer) as call:
@@ -106,13 +145,18 @@ def load_model():
 pipe, upscaler_2 = load_model()
 compiled_transformer = compile_transformer()
 spaces.aoti_apply(compiled_transformer, pipe.transformer)
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 4096
 @spaces.GPU(duration=45)
 def generate_images_30(prompt, neg_prompt_1, neg_prompt_2, neg_prompt_3, width, height, guidance, steps, progress=gr.Progress(track_tqdm=True)):
     seed = random.randint(0, MAX_SEED)
@@ -234,6 +278,7 @@ css = """
 #col-container {margin: 0 auto;max-width: 640px;}
 body{background-color: blue;}
 """
 with gr.Blocks(theme=gr.themes.Origin(), css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown(" # StableDiffusion 3.5 Large with UltraReal lora test")
@@ -310,7 +355,6 @@ with gr.Blocks(theme=gr.themes.Origin(), css=css) as demo:
             ],
             outputs=[result, expanded_prompt_output],
         )
 if __name__ == "__main__":
     demo.launch()

 subprocess.run(['sh', './spaces.sh'])
 import os
 os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
 os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
 os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,pinned_use_background_threads:True'
 import threading
 import io
 from google.oauth2 import service_account
 from google.cloud import storage
 def install_flashattn():
     """Run the ``./flashattn.sh`` script through ``sh``; the result is discarded."""
     command = ['sh', './flashattn.sh']
     subprocess.run(command)
+install_flashattn()
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
 torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
 from image_gen_aux import UpscaleWithModel
 GCS_BUCKET_NAME = os.getenv("GCS_BUCKET_NAME")
 GCS_SA_KEY = os.getenv("GCS_SA_KEY") # The full JSON key content as a string
 gcs_client = None
 if GCS_SA_KEY and GCS_BUCKET_NAME:
     try:
         credentials_info = eval(GCS_SA_KEY) # Using eval is safe here if you trust the secret source
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+from diffusers.models.attention_processor import AttnProcessor2_0
+from kernels import get_kernel
+fa3_kernel = get_kernel("kernels-community/flash-attn3") # Or vllm-flash-attn3
+class FlashAttentionProcessor(AttnProcessor2_0):
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None, # This will be present for cross-attention
+        attention_mask=None,
+        temb=None, # This might be present in some attention mechanisms, pass through if not used directly
+        **kwargs,
+    ):
+        # Determine if it's self-attention or cross-attention
+        # For self-attention, encoder_hidden_states is None or identical to hidden_states
+        is_cross_attention = encoder_hidden_states is not None and encoder_hidden_states.shape[1] != hidden_states.shape[1]
+        # SD3.5 uses DiT, where hidden_states are often 3D (B, Seq, Dim)
+        # However, attention can be within a transformer block which might internally reshape.
+        # Ensure your inputs (query, key, value) are properly shaped for the kernel.
+        # The kernel expects (Batch, Heads, Sequence, Dim_Head)
+        query = attn.to_q(hidden_states)
+        if is_cross_attention:
+            key = attn.to_k(encoder_hidden_states)
+            value = attn.to_v(encoder_hidden_states)
+        else: # Self-attention
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+        scale = attn.scale
+        query = query * scale
+        b, t, c = query.shape # B=batch_size, T=sequence_length, C=embedding_dim
+        h = attn.heads
+        d = c // h # dim_per_head
+        # Reshape to (Batch, Heads, Sequence, Dim_Head) for Flash Attention kernel
+        q_reshaped = query.reshape(b, t, h, d).permute(0, 2, 1, 3)
+        k_reshaped = key.reshape(b, t, h, d).permute(0, 2, 1, 3)
+        v_reshaped = value.reshape(b, t, h, d).permute(0, 2, 1, 3)
+        out_reshaped = torch.empty_like(q_reshaped)
+        # Call the Flash Attention kernel
+        fa3_kernel.attention(q_reshaped, k_reshaped, v_reshaped, out_reshaped)
+        # Reshape output back to (Batch, Sequence, Heads * Dim_Head)
+        out = out_reshaped.permute(0, 2, 1, 3).reshape(b, t, c)
+        out = attn.to_out(out)
+        return out
 @spaces.GPU(duration=120)
 def compile_transformer():
     with spaces.aoti_capture(pipe.transformer) as call:
 pipe, upscaler_2 = load_model()
+# Create a single processor instance that is shared by every module it is assigned to.
+fa_processor = FlashAttentionProcessor()
+# NOTE(review): named_modules() walks the transformer's submodules; confirm that
+# any of them is actually an AttnProcessor2_0 instance. Processors are presumably
+# stored on a module's `processor` attribute rather than being submodules
+# themselves, in which case this check never matches and nothing is replaced.
+for name, module in pipe.transformer.named_modules():
+    if isinstance(module, AttnProcessor2_0):
+        module.processor = fa_processor
 compiled_transformer = compile_transformer()
 spaces.aoti_apply(compiled_transformer, pipe.transformer)
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 4096
 @spaces.GPU(duration=45)
 def generate_images_30(prompt, neg_prompt_1, neg_prompt_2, neg_prompt_3, width, height, guidance, steps, progress=gr.Progress(track_tqdm=True)):
     seed = random.randint(0, MAX_SEED)
 #col-container {margin: 0 auto;max-width: 640px;}
 body{background-color: blue;}
 """
 with gr.Blocks(theme=gr.themes.Origin(), css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown(" # StableDiffusion 3.5 Large with UltraReal lora test")
             ],
             outputs=[result, expanded_prompt_output],
         )
 if __name__ == "__main__":
     demo.launch()