LiuZichen committed
Commit
f460ce6
1 Parent(s): c57bc42
Files changed (41)
  1. README.md +2 -2
  2. app.py +439 -4
  3. edit_space.py +461 -0
  4. requirements.txt +28 -0
  5. src/__init__.py +0 -0
  6. src/layers_cache.py +406 -0
  7. src/lora_helper.py +194 -0
  8. src/pipeline_flux_kontext_control.py +1230 -0
  9. src/transformer_flux.py +608 -0
  10. train/default_config.yaml +16 -0
  11. train/src/__init__.py +0 -0
  12. train/src/condition/edge_extraction.py +356 -0
  13. train/src/condition/hed.py +56 -0
  14. train/src/condition/informative_drawing.py +279 -0
  15. train/src/condition/lineart.py +86 -0
  16. train/src/condition/pidi.py +681 -0
  17. train/src/condition/ted.py +296 -0
  18. train/src/condition/util.py +202 -0
  19. train/src/generate_diff_mask.py +301 -0
  20. train/src/jsonl_datasets_kontext_color.py +166 -0
  21. train/src/jsonl_datasets_kontext_complete_lora.py +363 -0
  22. train/src/jsonl_datasets_kontext_edge.py +225 -0
  23. train/src/jsonl_datasets_kontext_interactive_lora.py +1332 -0
  24. train/src/jsonl_datasets_kontext_local.py +312 -0
  25. train/src/layers.py +279 -0
  26. train/src/lora_helper.py +196 -0
  27. train/src/masks_integrated.py +322 -0
  28. train/src/pipeline_flux_kontext_control.py +1009 -0
  29. train/src/prompt_helper.py +205 -0
  30. train/src/transformer_flux.py +625 -0
  31. train/train_kontext_color.py +858 -0
  32. train/train_kontext_color.sh +25 -0
  33. train/train_kontext_complete_lora.sh +20 -0
  34. train/train_kontext_edge.py +814 -0
  35. train/train_kontext_edge.sh +25 -0
  36. train/train_kontext_interactive_lora.sh +18 -0
  37. train/train_kontext_local.py +876 -0
  38. train/train_kontext_local.sh +26 -0
  39. train/train_kontext_lora.py +871 -0
  40. util.py +188 -0
  41. utils_node.py +199 -0
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
  title: MagicQuillV2
- emoji: 🏆
+ emoji: 🪶
  colorFrom: blue
  colorTo: blue
  sdk: gradio
- sdk_version: 6.0.1
+ sdk_version: 5.4.0
  app_file: app.py
  pinned: false
  ---
app.py CHANGED
@@ -1,7 +1,442 @@
 
 
  import gradio as gr

- def greet(name):
- return "Hello " + name + "!!"

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
1
+ import sys
2
+ import os
3
  import gradio as gr
4
+ import spaces
5
+ import tempfile
6
+ import numpy as np
7
+ import io
8
+ import base64
9
+ from gradio_client import Client, handle_file
10
+ from huggingface_hub import snapshot_download
11
+ from gradio_magicquillv2 import MagicQuillV2
12
+ from fastapi import FastAPI, Request
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+ import uvicorn
15
+ import requests
16
+ from PIL import Image, ImageOps
17
+ import random
18
+ import time
19
+ import torch
20
+ import json
21
 
22
+ # Local editing model and helper utilities
23
+ from edit_space import KontextEditModel
24
+ from util import (
25
+ load_and_preprocess_image,
26
+ read_base64_image as read_base64_image_utils,
27
+ create_alpha_mask,
28
+ tensor_to_base64,
29
+ get_mask_bbox
30
+ )
31
 
32
+ # Initialize models
33
+ print("Downloading models...")
34
+ hf_token = os.environ.get("hf_token")
35
+ snapshot_download(repo_id="LiuZichen/MagicQuillV2-models", repo_type="model", local_dir="models", token=hf_token)
36
+
37
+ print("Initializing models...")
38
+ kontext_model = KontextEditModel()
39
+
40
+ # Initialize the client for the remote SAM helper Space
+ sam_client = Client("LiuZichen/MagicQuillHelper")
43
+ print("Models initialized.")
44
+
45
+ css = """
46
+ .ms {
47
+ width: 60%;
48
+ margin: auto
49
+ }
50
+ """
51
+
52
+ url = "http://localhost:7860"
53
+
54
+ @spaces.GPU
55
+ def generate(merged_image, total_mask, original_image, add_color_image, add_edge_mask, remove_edge_mask, fill_mask, add_prop_image, positive_prompt, negative_prompt, fine_edge, fix_perspective, grow_size, edge_strength, color_strength, local_strength, seed, steps, cfg):
56
+ print("prompt is:", positive_prompt)
57
+ print("other parameters:", negative_prompt, fine_edge, fix_perspective, grow_size, edge_strength, color_strength, local_strength, seed, steps, cfg)
58
+
59
+ if kontext_model is None:
60
+ raise RuntimeError("KontextEditModel not initialized")
61
+
62
+ # Preprocess inputs: read_base64_image returns a BytesIO object, which both
+ # create_alpha_mask and load_and_preprocess_image accept via Image.open.
66
+
67
+ merged_image_tensor = load_and_preprocess_image(read_base64_image_utils(merged_image))
68
+ total_mask_tensor = create_alpha_mask(read_base64_image_utils(total_mask))
69
+ original_image_tensor = load_and_preprocess_image(read_base64_image_utils(original_image))
70
+
71
+ if add_color_image:
72
+ add_color_image_tensor = load_and_preprocess_image(read_base64_image_utils(add_color_image))
73
+ else:
74
+ add_color_image_tensor = original_image_tensor
75
+
76
+ add_mask = create_alpha_mask(read_base64_image_utils(add_edge_mask)) if add_edge_mask else torch.zeros_like(total_mask_tensor)
77
+ remove_mask = create_alpha_mask(read_base64_image_utils(remove_edge_mask)) if remove_edge_mask else torch.zeros_like(total_mask_tensor)
78
+ add_prop_mask = create_alpha_mask(read_base64_image_utils(add_prop_image)) if add_prop_image else torch.zeros_like(total_mask_tensor)
79
+ fill_mask_tensor = create_alpha_mask(read_base64_image_utils(fill_mask)) if fill_mask else torch.zeros_like(total_mask_tensor)
80
+
81
+ # Determine flag and modify prompt
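+ # "foreground": an object was dragged in (add_prop mask); "local": inpaint the fill mask;
+ # "removal": only erase strokes were drawn; "precise_edit": edge/color guided edit;
+ # "kontext": plain prompt-driven edit with no spatial control.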
82
+ flag = "kontext"
83
+ if torch.sum(add_prop_mask) > 0:
84
+ flag = "foreground"
85
+ positive_prompt = "Fill in the white region naturally and adapt the foreground into the background. Fix the perspective of the foreground object if necessary. " + positive_prompt
86
+ elif torch.sum(fill_mask_tensor).item() > 0:
87
+ flag = "local"
88
+ elif (torch.sum(remove_mask).item() > 0 and torch.sum(add_mask).item() == 0):
89
+ positive_prompt = "remove the instance"
90
+ flag = "removal"
91
+ elif (torch.sum(add_mask).item() > 0 or torch.sum(remove_mask).item() > 0 or (not torch.equal(original_image_tensor, add_color_image_tensor))):
92
+ flag = "precise_edit"
93
+
94
+ print("positive prompt: ", positive_prompt)
95
+ print("current flag: ", flag)
96
+
97
+ final_image, condition, mask = kontext_model.process(
98
+ original_image_tensor,
99
+ add_color_image_tensor,
100
+ merged_image_tensor,
101
+ positive_prompt,
102
+ total_mask_tensor,
103
+ add_mask,
104
+ remove_mask,
105
+ add_prop_mask,
106
+ fill_mask_tensor,
107
+ fine_edge,
108
+ fix_perspective,
109
+ edge_strength,
110
+ color_strength,
111
+ local_strength,
112
+ grow_size,
113
+ seed,
114
+ steps,
115
+ cfg,
116
+ flag,
117
+ )
118
+
119
+ # tensor_to_base64 returns pure base64 string
120
+ res_base64 = tensor_to_base64(final_image)
121
+ return res_base64
122
+
123
+ def generate_image_handler(x, negative_prompt, fine_edge, fix_perspective, grow_size, edge_strength, color_strength, local_strength, seed, steps, cfg):
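+ # x is the MagicQuillV2 component value: the editor sends images and masks under
+ # x['from_frontend'], and the generated result is written back to x['from_backend'].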
124
+ merged_image = x['from_frontend']['img']
125
+ total_mask = x['from_frontend']['total_mask']
126
+ original_image = x['from_frontend']['original_image']
127
+ add_color_image = x['from_frontend']['add_color_image']
128
+ add_edge_mask = x['from_frontend']['add_edge_mask']
129
+ remove_edge_mask = x['from_frontend']['remove_edge_mask']
130
+ fill_mask = x['from_frontend']['fill_mask']
131
+ add_prop_image = x['from_frontend']['add_prop_image']
132
+ positive_prompt = x['from_backend']['prompt']
133
+
134
+ try:
135
+ res_base64 = generate(
136
+ merged_image,
137
+ total_mask,
138
+ original_image,
139
+ add_color_image,
140
+ add_edge_mask,
141
+ remove_edge_mask,
142
+ fill_mask,
143
+ add_prop_image,
144
+ positive_prompt,
145
+ negative_prompt,
146
+ fine_edge,
147
+ fix_perspective,
148
+ grow_size,
149
+ edge_strength,
150
+ color_strength,
151
+ local_strength,
152
+ seed,
153
+ steps,
154
+ cfg
155
+ )
156
+ x["from_backend"]["generated_image"] = res_base64
157
+ except Exception as e:
158
+ print(f"Error in generation: {e}")
159
+ x["from_backend"]["generated_image"] = None
160
+
161
+ return x
162
+
163
+
164
+ with gr.Blocks(title="MagicQuill V2") as demo:
165
+ with gr.Row():
166
+ ms = MagicQuillV2()
167
+
168
+ with gr.Row():
169
+ with gr.Column():
170
+ btn = gr.Button("Run", variant="primary")
171
+ with gr.Column():
172
+ with gr.Accordion("parameters", open=False):
173
+ negative_prompt = gr.Textbox(
174
+ label="Negative Prompt",
175
+ value="",
176
+ interactive=True
177
+ )
178
+ fine_edge = gr.Radio(
179
+ label="Fine Edge",
180
+ choices=['enable', 'disable'],
181
+ value='disable',
182
+ interactive=True
183
+ )
184
+ fix_perspective = gr.Radio(
185
+ label="Fix Perspective",
186
+ choices=['enable', 'disable'],
187
+ value='disable',
188
+ interactive=True
189
+ )
190
+ grow_size = gr.Slider(
191
+ label="Grow Size",
192
+ minimum=10,
193
+ maximum=100,
194
+ value=50,
195
+ step=1,
196
+ interactive=True
197
+ )
198
+ edge_strength = gr.Slider(
199
+ label="Edge Strength",
200
+ minimum=0.0,
201
+ maximum=5.0,
202
+ value=0.6,
203
+ step=0.01,
204
+ interactive=True
205
+ )
206
+ color_strength = gr.Slider(
207
+ label="Color Strength",
208
+ minimum=0.0,
209
+ maximum=5.0,
210
+ value=1.5,
211
+ step=0.01,
212
+ interactive=True
213
+ )
214
+ local_strength = gr.Slider(
215
+ label="Local Strength",
216
+ minimum=0.0,
217
+ maximum=5.0,
218
+ value=1.0,
219
+ step=0.01,
220
+ interactive=True
221
+ )
222
+ seed = gr.Number(
223
+ label="Seed",
224
+ value=-1,
225
+ precision=0,
226
+ interactive=True
227
+ )
228
+ steps = gr.Slider(
229
+ label="Steps",
230
+ minimum=0,
231
+ maximum=50,
232
+ value=20,
233
+ interactive=True
234
+ )
235
+ cfg = gr.Slider(
236
+ label="CFG",
237
+ minimum=0.0,
238
+ maximum=20.0,
239
+ value=3.5,
240
+ step=0.1,
241
+ interactive=True
242
+ )
243
+
244
+ btn.click(generate_image_handler, inputs=[ms, negative_prompt, fine_edge, fix_perspective, grow_size, edge_strength, color_strength, local_strength, seed, steps, cfg], outputs=ms)
245
+
246
+ app = FastAPI()
247
+ app.add_middleware(
248
+ CORSMiddleware,
249
+ allow_origins=['*'],
250
+ allow_credentials=True,
251
+ allow_methods=["*"],
252
+ allow_headers=["*"],
253
+ )
254
+
255
+ def get_root_url(
256
+ request: Request, route_path: str, root_path: str | None
257
+ ):
258
+ print(root_path)
259
+ return root_path
260
+ import gradio.route_utils
261
+ gr.route_utils.get_root_url = get_root_url
262
+
263
+ gr.mount_gradio_app(app, demo, path="/demo", root_path="/demo")
264
+
265
+ @app.post("/magic_quill/generate_image")
266
+ async def generate_image(request: Request):
267
+ data = await request.json()
268
+ res = generate(
269
+ data["merged_image"],
270
+ data["total_mask"],
271
+ data["original_image"],
272
+ data["add_color_image"],
273
+ data["add_edge_mask"],
274
+ data["remove_edge_mask"],
275
+ data["fill_mask"],
276
+ data["add_prop_image"],
277
+ data["positive_prompt"],
278
+ data["negative_prompt"],
279
+ data["fine_edge"],
280
+ data["fix_perspective"],
281
+ data["grow_size"],
282
+ data["edge_strength"],
283
+ data["color_strength"],
284
+ data["local_strength"],
285
+ data["seed"],
286
+ data["steps"],
287
+ data["cfg"]
288
+ )
289
+ return {'res': res}
290
+
291
+ @app.post("/magic_quill/process_background_img")
292
+ async def process_background_img(request: Request):
293
+ img = await request.json()
294
+ from util import process_background
295
+ # process_background returns tensor [1, H, W, 3] in uint8 or float
296
+ resized_img_tensor = process_background(img)
297
+
298
+ # tensor_to_base64 from util expects tensor
299
+ resized_img_base64 = "data:image/webp;base64," + tensor_to_base64(
300
+ resized_img_tensor,
301
+ quality=80,
302
+ method=6
303
+ )
304
+ return resized_img_base64
305
+
306
+ @app.post("/magic_quill/segmentation")
307
+ async def segmentation(request: Request):
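+ # Delegate SAM inference to the remote helper Space: save the input image to a
+ # temporary WebP file, request a mask via the /segment API, then merge the mask
+ # back into the image as an alpha channel and return it as a base64 PNG.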
308
+ json_data = await request.json()
309
+ image_base64 = json_data.get("image", None)
310
+ coordinates_positive = json_data.get("coordinates_positive", None)
311
+ coordinates_negative = json_data.get("coordinates_negative", None)
312
+ bboxes = json_data.get("bboxes", None)
313
+
314
+ if sam_client is None:
315
+ return {"error": "sam client not initialized"}
316
+
317
+ # Process coordinates and bboxes
318
+ pos_coordinates = None
319
+ if coordinates_positive and len(coordinates_positive) > 0:
320
+ pos_coordinates = []
321
+ for coord in coordinates_positive:
322
+ coord['x'] = int(round(coord['x']))
323
+ coord['y'] = int(round(coord['y']))
324
+ pos_coordinates.append({'x': coord['x'], 'y': coord['y']})
325
+ pos_coordinates = json.dumps(pos_coordinates)
326
+
327
+ neg_coordinates = None
328
+ if coordinates_negative and len(coordinates_negative) > 0:
329
+ neg_coordinates = []
330
+ for coord in coordinates_negative:
331
+ coord['x'] = int(round(coord['x']))
332
+ coord['y'] = int(round(coord['y']))
333
+ neg_coordinates.append({'x': coord['x'], 'y': coord['y']})
334
+ neg_coordinates = json.dumps(neg_coordinates)
335
+
336
+ bboxes_xyxy = None
337
+ if bboxes and len(bboxes) > 0:
338
+ valid_bboxes = []
339
+ for bbox in bboxes:
340
+ if (bbox.get("startX") is None or
341
+ bbox.get("startY") is None or
342
+ bbox.get("endX") is None or
343
+ bbox.get("endY") is None):
344
+ continue
345
+ else:
346
+ x_min = max(min(int(bbox["startX"]), int(bbox["endX"])), 0)
347
+ y_min = max(min(int(bbox["startY"]), int(bbox["endY"])), 0)
348
+ # Image dimensions are not available here, so clamp only the lower bound to 0
+ # and let the SAM Space handle any upper-bound clipping.
+ x_max = max(int(bbox["startX"]), int(bbox["endX"]))
+ y_max = max(int(bbox["startY"]), int(bbox["endY"]))
353
+ valid_bboxes.append((x_min, y_min, x_max, y_max))
354
+
355
+ bboxes_xyxy = valid_bboxes
+
+ # Serialize as a JSON string for consistency with the coordinate arguments
362
+ if bboxes_xyxy:
363
+ bboxes_xyxy = json.dumps(bboxes_xyxy)
364
+
365
+ print(f"Segmentation request: pos={pos_coordinates}, neg={neg_coordinates}, bboxes={bboxes_xyxy}")
366
+
367
+ try:
368
+ # Save base64 image to temp file
369
+ image_bytes = read_base64_image_utils(image_base64)
370
+ # Image.open to verify and save as WebP (smaller size)
371
+ pil_image = Image.open(image_bytes)
372
+ with tempfile.NamedTemporaryFile(suffix=".webp", delete=False) as temp_in:
373
+ pil_image.save(temp_in.name, format="WEBP", quality=80)
374
+ temp_in_path = temp_in.name
375
+
376
+ # Execute segmentation via Client
377
+ # The remote Space returns a filepath to the predicted mask image (white = selected, black = background)
379
+ result_path = sam_client.predict(
380
+ handle_file(temp_in_path),
381
+ pos_coordinates,
382
+ neg_coordinates,
383
+ bboxes_xyxy,
384
+ api_name="/segment"
385
+ )
386
+
387
+ # Clean up input temp
388
+ os.unlink(temp_in_path)
389
+
390
+ # Process result
391
+ # result_path should be a generic object, usually a tuple (image_path, mask_path) or just image_path
392
+ # Depending on how the remote space is implemented.
393
+ if isinstance(result_path, (list, tuple)):
394
+ result_path = result_path[0] # Take the first return value if multiple
395
+
396
+ if not result_path or not os.path.exists(result_path):
397
+ raise RuntimeError("Client returned invalid result path")
398
+
399
+ # result_path is the Mask Image (White=Selected, Black=Background)
400
+ mask_pil = Image.open(result_path)
401
+ if mask_pil.mode != 'L':
402
+ mask_pil = mask_pil.convert('L')
403
+
404
+ pil_image = pil_image.convert("RGB")
405
+ if pil_image.size != mask_pil.size:
406
+ mask_pil = mask_pil.resize(pil_image.size, Image.NEAREST)
407
+
408
+ r, g, b = pil_image.split()
409
+ res_pil = Image.merge("RGBA", (r, g, b, mask_pil))
410
+
411
+ # Extract bbox from mask (alpha)
412
+ mask_tensor = torch.from_numpy(np.array(mask_pil) / 255.0).float().unsqueeze(0)
413
+ mask_bbox = get_mask_bbox(mask_tensor)
414
+ if mask_bbox:
415
+ x_min, y_min, x_max, y_max = mask_bbox
416
+ seg_bbox = {'startX': x_min, 'startY': y_min, 'endX': x_max, 'endY': y_max}
417
+ else:
418
+ seg_bbox = {'startX': 0, 'startY': 0, 'endX': 0, 'endY': 0}
419
+
420
+ print(seg_bbox)
421
+
422
+ # Convert result to base64
423
+ # We need to convert the PIL image to base64 string
424
+ buffered = io.BytesIO()
425
+ res_pil.save(buffered, format="PNG")
426
+ image_base64_res = base64.b64encode(buffered.getvalue()).decode("utf-8")
427
+
428
+ return {
429
+ "error": False,
430
+ "segmentation_image": "data:image/png;base64," + image_base64_res,
431
+ "segmentation_bbox": seg_bbox
432
+ }
433
+
434
+ except Exception as e:
435
+ print(f"Error in segmentation: {e}")
436
+ return {"error": str(e)}
437
+
438
+ app = gr.mount_gradio_app(app, demo, "/")
439
+
440
+ if __name__ == "__main__":
441
+ uvicorn.run(app, host="0.0.0.0", port=7860)
442
+ # demo.launch()
edit_space.py ADDED
@@ -0,0 +1,461 @@
1
+ import os
2
+ import torch.nn.functional as F
3
+ import torch
4
+ import sys
5
+ import cv2
6
+ import numpy as np
7
+ from PIL import Image
8
+ import json
9
+
10
+
11
+ # New imports for the diffuser pipeline
12
+ from src.pipeline_flux_kontext_control import FluxKontextControlPipeline
13
+ from src.transformer_flux import FluxTransformer2DModel
14
+
15
+ import tempfile
16
+ from safetensors.torch import load_file, save_file
17
+
18
+ _original_load_lora_weights = FluxKontextControlPipeline.load_lora_weights
19
+
20
+ def _patched_load_lora_weights(self, pretrained_model_name_or_path_or_dict, **kwargs):
21
+ """自动转换混合格式的 LoRA 并添加 transformer 前缀"""
22
+ weight_name = kwargs.get("weight_name", "pytorch_lora_weights.safetensors")
23
+
24
+ if isinstance(pretrained_model_name_or_path_or_dict, str):
25
+ if os.path.isdir(pretrained_model_name_or_path_or_dict):
26
+ lora_file = os.path.join(pretrained_model_name_or_path_or_dict, weight_name)
27
+ else:
28
+ lora_file = pretrained_model_name_or_path_or_dict
29
+
30
+ if os.path.exists(lora_file):
31
+ state_dict = load_file(lora_file)
32
+
33
+ # Check whether the keys need format conversion or a 'transformer.' prefix
34
+ needs_format_conversion = any('lora_A.weight' in k or 'lora_B.weight' in k for k in state_dict.keys())
35
+ needs_prefix = not any(k.startswith('transformer.') for k in state_dict.keys())
36
+
37
+ if needs_format_conversion or needs_prefix:
38
+ print(f"🔄 Processing LoRA: {lora_file}")
39
+ if needs_format_conversion:
40
+ print(f" - Converting PEFT format to diffusers format")
41
+ if needs_prefix:
42
+ print(f" - Adding 'transformer.' prefix to keys")
43
+
44
+ converted_state = {}
45
+ converted_count = 0
46
+
47
+ for key, value in state_dict.items():
48
+ new_key = key
49
+
50
+ # Step 1: convert PEFT-style keys to the diffusers format
51
+ if 'lora_A.weight' in new_key:
52
+ new_key = new_key.replace('lora_A.weight', 'lora.down.weight')
53
+ converted_count += 1
54
+ elif 'lora_B.weight' in new_key:
55
+ new_key = new_key.replace('lora_B.weight', 'lora.up.weight')
56
+ converted_count += 1
57
+
58
+ # Step 2: add the 'transformer.' prefix (if not already present)
59
+ if not new_key.startswith('transformer.'):
60
+ new_key = f'transformer.{new_key}'
61
+
62
+ converted_state[new_key] = value
63
+
64
+ if needs_format_conversion:
65
+ print(f" ✅ Converted {converted_count} PEFT keys")
66
+ print(f" ✅ Total keys: {len(converted_state)}")
67
+
68
+ with tempfile.TemporaryDirectory() as temp_dir:
69
+ temp_file = os.path.join(temp_dir, weight_name)
70
+ save_file(converted_state, temp_file)
71
+ return _original_load_lora_weights(self, temp_dir, **kwargs)
72
+ else:
73
+ print(f"✅ LoRA already in correct format: {lora_file}")
74
+
75
+ # No conversion needed; fall back to the original loader
76
+ return _original_load_lora_weights(self, pretrained_model_name_or_path_or_dict, **kwargs)
77
+
78
+ # Apply the monkey patch
79
+ FluxKontextControlPipeline.load_lora_weights = _patched_load_lora_weights
80
+ print("✅ Monkey patch applied to FluxKontextPipeline.load_lora_weights")
81
+
82
+ current_dir = os.path.dirname(os.path.abspath(__file__))
83
+ sys.path.append(current_dir)
84
+ sys.path.append(os.path.abspath(os.path.join(current_dir, '..')))
85
+ sys.path.append(os.path.abspath(os.path.join(current_dir, '..', '..', 'comfy_extras')))
86
+
87
+ from train.src.condition.edge_extraction import InformativeDetector, HEDDetector
88
+ from utils_node import BlendInpaint, JoinImageWithAlpha, GrowMask, InvertMask, ColorDetector
89
+
90
+ TEST_MODE = False
91
+
92
+ class KontextEditModel():
93
+ def __init__(self, base_model_path="/data0/lzc/FLUX.1-Kontext-dev", device="cuda",
94
+ aux_lora_dir="models/v2_ckpt", easycontrol_base_dir="models/v2_ckpt",
95
+ aux_lora_weight_name="puzzle_lora.safetensors",
96
+ aux_lora_weight=1.0):
97
+ # Keep necessary preprocessors
98
+ self.mask_processor = GrowMask()
99
+ self.scribble_processor = HEDDetector.from_pretrained()
100
+ self.lineart_processor = InformativeDetector.from_pretrained()
101
+ self.color_processor = ColorDetector()
102
+ self.blender = BlendInpaint()
103
+
104
+ # Initialize the new pipeline (Kontext version)
105
+ self.device = device
106
+ self.pipe = FluxKontextControlPipeline.from_pretrained(base_model_path, torch_dtype=torch.bfloat16)
107
+ transformer = FluxTransformer2DModel.from_pretrained(
108
+ base_model_path,
109
+ subfolder="transformer",
110
+ torch_dtype=torch.bfloat16,
111
+ device=self.device
112
+ )
113
+ self.pipe.transformer = transformer
114
+ self.pipe.to(self.device, dtype=torch.bfloat16)
115
+
116
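+ # Register one control LoRA per spatial condition type; during inference,
+ # control_dict["type"] names the condition that the provided spatial_images
+ # and gammas belong to (see edge_edit / local_edit / object_removal below).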
+ control_lora_config = {
117
+ "local": {
118
+ "path": os.path.join(easycontrol_base_dir, "local_lora.safetensors"),
119
+ "lora_weights": [1.0],
120
+ "cond_size": 512,
121
+ },
122
+ "removal": {
123
+ "path": os.path.join(easycontrol_base_dir, "removal_lora.safetensors"),
124
+ "lora_weights": [1.0],
125
+ "cond_size": 512,
126
+ },
127
+ "edge": {
128
+ "path": os.path.join(easycontrol_base_dir, "edge_lora.safetensors"),
129
+ "lora_weights": [1.0],
130
+ "cond_size": 512,
131
+ },
132
+ "color": {
133
+ "path": os.path.join(easycontrol_base_dir, "color_lora.safetensors"),
134
+ "lora_weights": [1.0],
135
+ "cond_size": 512,
136
+ },
137
+ }
138
+ self.pipe.load_control_loras(control_lora_config)
139
+
140
+ # Aux LoRA for foreground mode
141
+ self.aux_lora_weight_name = aux_lora_weight_name
142
+ self.aux_lora_dir = aux_lora_dir
143
+ self.aux_lora_weight = aux_lora_weight
144
+ self.aux_adapter_name = "aux"
145
+
146
+ from safetensors.torch import load_file as _sft_load
147
+ aux_path = os.path.join(self.aux_lora_dir, self.aux_lora_weight_name)
148
+ if os.path.isfile(aux_path):
149
+ self.pipe.load_lora_weights(aux_path, adapter_name=self.aux_adapter_name)
150
+ print(f"Loaded aux LoRA: {aux_path}")
151
+ # Ensure aux LoRA is disabled by default; it will be enabled only in foreground_edit
152
+ self._disable_aux_lora()
153
+ else:
154
+ print(f"Aux LoRA not found at {aux_path}, foreground mode will run without it.")
155
+
156
+
157
+ # gamma is now applied inside the pipeline based on control_dict
158
+
159
+ def _tensor_to_pil(self, tensor_image):
160
+ # Converts a ComfyUI-style tensor [1, H, W, 3] to a PIL Image
161
+ return Image.fromarray(np.clip(255. * tensor_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
162
+
163
+ def _pil_to_tensor(self, pil_image):
164
+ # Converts a PIL image to a ComfyUI-style tensor [1, H, W, 3]
165
+ return torch.from_numpy(np.array(pil_image).astype(np.float32) / 255.0).unsqueeze(0)
166
+
167
+ def clear_cache(self):
168
+ for name, attn_processor in self.pipe.transformer.attn_processors.items():
169
+ if hasattr(attn_processor, 'bank_kv'):
170
+ attn_processor.bank_kv.clear()
171
+ if hasattr(attn_processor, 'bank_attn'):
172
+ attn_processor.bank_attn = None
173
+
174
+ def _enable_aux_lora(self):
175
+ self.pipe.enable_lora()
176
+ self.pipe.set_adapters([self.aux_adapter_name], adapter_weights=[self.aux_lora_weight])
177
+ print(f"Enabled aux LoRA '{self.aux_adapter_name}' with weight {self.aux_lora_weight}")
178
+
179
+ def _disable_aux_lora(self):
180
+ self.pipe.disable_lora()
181
+ print("Disabled aux LoRA")
182
+
183
+ def _expand_mask(self, mask_tensor: torch.Tensor, expand: int = 0) -> torch.Tensor:
184
+ if expand <= 0:
185
+ return mask_tensor
186
+ expanded = self.mask_processor.expand_mask(mask_tensor, expand=expand, tapered_corners=True)[0]
187
+ return expanded
188
+
189
+ def _tensor_mask_to_pil3(self, mask_tensor: torch.Tensor) -> Image.Image:
190
+ mask_01 = torch.clamp(mask_tensor, 0.0, 1.0)
191
+ if mask_01.ndim == 3 and mask_01.shape[-1] == 3:
192
+ mask_01 = mask_01[..., 0]
193
+ if mask_01.ndim == 3 and mask_01.shape[0] == 1:
194
+ mask_01 = mask_01[0]
195
+ pil = self._tensor_to_pil(mask_01.unsqueeze(-1).repeat(1, 1, 3))
196
+ return pil
197
+
198
+ def _apply_black_mask(self, image_tensor: torch.Tensor, binary_mask: torch.Tensor) -> Image.Image:
199
+ # image_tensor: [1, H, W, 3] in [0,1]
200
+ # binary_mask: [H, W] or [1, H, W], 1=mask area (white)
201
+ if binary_mask.ndim == 3:
202
+ binary_mask = binary_mask[0]
203
+ mask_bool = (binary_mask > 0.5)
204
+ img = image_tensor.clone()
205
+ img[0][mask_bool] = 0.0
206
+ return self._tensor_to_pil(img)
207
+
208
+ def edge_edit(self,
209
+ image, colored_image, positive_prompt,
210
+ base_mask, add_mask, remove_mask,
211
+ fine_edge,
212
+ edge_strength, color_strength,
213
+ seed, steps, cfg):
214
+
215
+ generator = torch.Generator(device=self.device).manual_seed(seed)
216
+
217
+ # Prepare mask and original image
218
+ original_image_tensor = image.clone()
219
+ original_mask = self._expand_mask(base_mask, expand=25)
221
+
222
+ image_pil = self._tensor_to_pil(image)
223
+ # image_pil.save("image_pil.png")
224
+ control_dict = {}
225
+ lineart_output = None
226
+
227
+ # Determine control type: color or edge
228
+ if not torch.equal(image, colored_image):
229
+ print("Apply color control")
230
+ colored_image_pil = self._tensor_to_pil(colored_image)
231
+ # Create color block condition
232
+ color_image_np = np.array(colored_image_pil)
233
+ downsampled = cv2.resize(color_image_np, (32, 32), interpolation=cv2.INTER_AREA)
234
+ upsampled = cv2.resize(downsampled, (256, 256), interpolation=cv2.INTER_NEAREST)
235
+ color_block = Image.fromarray(upsampled)
236
+ # Create grayscale condition
237
+
238
+ control_dict = {
239
+ "type": "color",
240
+ "spatial_images": [color_block],
241
+ "gammas": [color_strength]
242
+ }
243
+ else:
244
+ print("Apply edge control")
245
+ if fine_edge == "enable":
246
+ lineart_image = self.lineart_processor(np.array(self._tensor_to_pil(image.cpu().squeeze())), detect_resolution=1024, style="contour", output_type="pil")
247
+ lineart_output = self._pil_to_tensor(lineart_image)
248
+ else:
249
+ scribble_image = self.scribble_processor(np.array(self._tensor_to_pil(image.cpu().squeeze())), safe=True, resolution=512, output_type="pil")
250
+ lineart_output = self._pil_to_tensor(scribble_image)
251
+
252
+ if lineart_output is None:
253
+ raise ValueError("Preprocessor failed to generate lineart.")
254
+
255
+ # Apply user sketches to the lineart
256
+ add_mask_resized = F.interpolate(add_mask.unsqueeze(0).float(), size=(lineart_output.shape[1], lineart_output.shape[2]), mode='nearest').squeeze(0)
257
+ remove_mask_resized = F.interpolate(remove_mask.unsqueeze(0).float(), size=(lineart_output.shape[1], lineart_output.shape[2]), mode='nearest').squeeze(0)
258
+
259
+ bool_add_mask_resized = (add_mask_resized > 0.5)
260
+ bool_remove_mask_resized = (remove_mask_resized > 0.5)
261
+
262
+ lineart_output[bool_remove_mask_resized] = 0.0
263
+ lineart_output[bool_add_mask_resized] = 1.0
264
+
265
+ control_dict = {
266
+ "type": "edge",
267
+ "spatial_images": [self._tensor_to_pil(lineart_output)],
268
+ "gammas": [edge_strength]
269
+ }
270
+
271
+ # Prepare debug/output images
272
+ debug_image = lineart_output if lineart_output is not None else self.color_processor.execute(colored_image, resolution=1024)[0]
273
+
274
+ # Run inference
275
+ result_pil = self.pipe(
276
+ prompt=positive_prompt,
277
+ image=image_pil,
278
+ height=image_pil.height,
279
+ width=image_pil.width,
280
+ guidance_scale=cfg,
281
+ num_inference_steps=steps,
282
+ generator=generator,
283
+ max_sequence_length=128,
284
+ control_dict=control_dict,
285
+ ).images[0]
286
+
287
+ self.clear_cache()
288
+
289
+ # result_pil.save("result_pil.png")
290
+ result_tensor = self._pil_to_tensor(result_pil)
291
+ # final_image = self.blender.blend_inpaint(result_tensor, original_image_tensor, original_mask, kernel=10, sigma=10)[0]
292
+ final_image = result_tensor
293
+ return (final_image, debug_image, original_mask)
294
+
295
+ def object_removal(self,
296
+ image, positive_prompt,
297
+ remove_mask,
298
+ local_strength,
299
+ seed, steps, cfg):
300
+
301
+ generator = torch.Generator(device=self.device).manual_seed(seed)
302
+
303
+ original_image_tensor = image.clone()
304
+ original_mask = self._expand_mask(remove_mask, expand=25)
306
+
307
+ image_pil = self._tensor_to_pil(image)
308
+ # image_pil.save("image_pil.png")
309
+ # Prepare spatial image: original masked to black in the remove area
310
+ spatial_pil = self._apply_black_mask(image, original_mask)
311
+ # spatial_pil.save("spatial_pil.png")
312
+ # Note: mask is not passed to pipeline; we use it only for blending
313
+ control_dict = {
314
+ "type": "removal",
315
+ "spatial_images": [spatial_pil],
316
+ "gammas": [local_strength]
317
+ }
318
+
319
+ result_pil = self.pipe(
320
+ prompt=positive_prompt,
321
+ image=image_pil,
322
+ height=image_pil.height,
323
+ width=image_pil.width,
324
+ guidance_scale=cfg,
325
+ num_inference_steps=steps,
326
+ generator=generator,
327
+ control_dict=control_dict,
328
+ ).images[0]
329
+
330
+ self.clear_cache()
331
+
332
+ result_tensor = self._pil_to_tensor(result_pil)
333
+ final_image = self.blender.blend_inpaint(result_tensor, original_image_tensor, original_mask, kernel=10, sigma=10)[0]
334
+ # final_image = result_tensor
335
+ return (final_image, self._pil_to_tensor(spatial_pil), original_mask)
336
+
337
+ def local_edit(self,
338
+ image, positive_prompt, fill_mask, local_strength,
339
+ seed, steps, cfg):
340
+ generator = torch.Generator(device=self.device).manual_seed(seed)
341
+ original_image_tensor = image.clone()
342
+ original_mask = self._expand_mask(fill_mask, expand=25)
343
+ image_pil = self._tensor_to_pil(image)
344
+ # image_pil.save("image_pil.png")
345
+
346
+ spatial_pil = self._apply_black_mask(image, original_mask)
347
+ # spatial_pil.save("spatial_pil.png")
348
+ control_dict = {
349
+ "type": "local",
350
+ "spatial_images": [spatial_pil],
351
+ "gammas": [local_strength]
352
+ }
353
+
354
+ result_pil = self.pipe(
355
+ prompt=positive_prompt,
356
+ image=image_pil,
357
+ height=image_pil.height,
358
+ width=image_pil.width,
359
+ guidance_scale=cfg,
360
+ num_inference_steps=steps,
361
+ generator=generator,
362
+ max_sequence_length=128,
363
+ control_dict=control_dict,
364
+ ).images[0]
365
+
366
+ self.clear_cache()
367
+ result_tensor = self._pil_to_tensor(result_pil)
368
+ final_image = self.blender.blend_inpaint(result_tensor, original_image_tensor, original_mask, kernel=10, sigma=10)[0]
369
+ # final_image = result_tensor
370
+ return (final_image, self._pil_to_tensor(spatial_pil), original_mask)
371
+
372
+ def foreground_edit(self,
373
+ merged_image, positive_prompt,
374
+ add_prop_mask, fill_mask, fix_perspective, grow_size,
375
+ seed, steps, cfg):
376
+ generator = torch.Generator(device=self.device).manual_seed(seed)
377
+
378
+ edit_mask = torch.clamp(self._expand_mask(add_prop_mask, expand=grow_size) + fill_mask, 0.0, 1.0)
379
+ final_mask = self._expand_mask(edit_mask, expand=25)
380
+ if fix_perspective == "enable":
381
+ positive_prompt = positive_prompt + " Fix the perspective if necessary."
382
+ # Prepare edited input image: inside edit_mask but outside add_prop_mask set to white
383
+ img = merged_image.clone()
384
+ base_mask = (edit_mask > 0.5)
385
+ add_only = (add_prop_mask <= 0.5) & base_mask # [1, H, W] bool
386
+ add_only_3 = add_only.squeeze(0).unsqueeze(-1).expand(-1, -1, img.shape[-1]) # [H, W, 3]
387
+ img[0] = torch.where(add_only_3, torch.ones_like(img[0]), img[0])
388
+
389
+ image_pil = self._tensor_to_pil(img)
390
+ # image_pil.save("image_pil.png")
391
+
392
+ # Enable aux LoRA only for foreground
393
+ self._enable_aux_lora()
394
+
395
+ result_pil = self.pipe(
396
+ prompt=positive_prompt,
397
+ image=image_pil,
398
+ height=image_pil.height,
399
+ width=image_pil.width,
400
+ guidance_scale=cfg,
401
+ num_inference_steps=steps,
402
+ generator=generator,
403
+ max_sequence_length=128,
404
+ control_dict=None,
405
+ ).images[0]
406
+
407
+ # Disable aux LoRA afterwards
408
+ self._disable_aux_lora()
409
+
410
+ self.clear_cache()
411
+ final_image = self._pil_to_tensor(result_pil)
412
+ # final_image = self.blender.blend_inpaint(final_image, img, final_mask, kernel=10, sigma=10)[0]
413
+ return (final_image, self._pil_to_tensor(image_pil), edit_mask)
414
+
415
+ def kontext_edit(self,
416
+ image, positive_prompt,
417
+ seed, steps, cfg):
418
+ generator = torch.Generator(device=self.device).manual_seed(seed)
419
+ image_pil = self._tensor_to_pil(image)
420
+
421
+ result_pil = self.pipe(
422
+ prompt=positive_prompt,
423
+ image=image_pil,
424
+ height=image_pil.height,
425
+ width=image_pil.width,
426
+ guidance_scale=cfg,
427
+ num_inference_steps=steps,
428
+ generator=generator,
429
+ max_sequence_length=128,
430
+ control_dict=None,
431
+ ).images[0]
432
+
433
+ final_image = self._pil_to_tensor(result_pil)
434
+ mask = torch.zeros((1, final_image.shape[1], final_image.shape[2]), dtype=torch.float32, device=final_image.device)
435
+ return (final_image, image, mask)
436
+
437
+ def process(self, image, colored_image,
438
+ merged_image, positive_prompt,
439
+ total_mask, add_mask, remove_mask, add_prop_mask, fill_mask,
440
+ fine_edge, fix_perspective, edge_strength, color_strength, local_strength, grow_size,
441
+ seed, steps, cfg, flag="precise_edit"):
442
+ if flag == "foreground":
443
+ return self.foreground_edit(merged_image, positive_prompt, add_prop_mask, fill_mask, fix_perspective, grow_size, seed, steps, cfg)
444
+ elif flag == "local":
445
+ return self.local_edit(image, positive_prompt, fill_mask, local_strength, seed, steps, cfg)
446
+ elif flag == "removal":
447
+ return self.object_removal(image, positive_prompt, remove_mask, local_strength, seed, steps, cfg)
448
+ elif flag == "precise_edit":
449
+ return self.edge_edit(
450
+ image, colored_image, positive_prompt,
451
+ total_mask, add_mask, remove_mask,
452
+ fine_edge,
453
+ edge_strength, color_strength,
455
+ seed, steps, cfg
456
+ )
457
+ elif flag == "kontext":
458
+ return self.kontext_edit(image, positive_prompt, seed, steps, cfg)
459
+ else:
460
+ raise ValueError("Invalid Editing Type: {}".format(flag))
461
+
requirements.txt ADDED
@@ -0,0 +1,28 @@
+ accelerate
+ datasets
+ diffusers
+ easydict
+ einops
+ fastapi
+ gradio==5.4.0
+ gradio_client
+ huggingface_hub
+ numpy
+ opencv-python
+ peft
+ pillow
+ protobuf
+ requests
+ safetensors
+ scikit-image
+ scipy
+ git+https://github.com/facebookresearch/segment-anything.git
+ sentencepiece
+ spaces
+ torch
+ torchaudio
+ torchvision
+ tqdm
+ transformers
+ uvicorn
+ ./gradio_magicquillv2-0.0.1-py3-none-any.whl
src/__init__.py ADDED
File without changes
src/layers_cache.py ADDED
@@ -0,0 +1,406 @@
1
+ import inspect
2
+ import math
3
+ from typing import Callable, List, Optional, Tuple, Union, Any, Dict
4
+ from einops import rearrange
5
+ import torch
6
+ from torch import nn
7
+ import torch.nn.functional as F
8
+ from torch import Tensor
9
+ from diffusers.models.attention_processor import Attention
10
+
11
+ TXTLEN = 128
12
+ KONTEXT = False
13
+
14
+ class LoRALinearLayer(nn.Module):
15
+ def __init__(
16
+ self,
17
+ in_features: int,
18
+ out_features: int,
19
+ rank: int = 4,
20
+ network_alpha: Optional[float] = None,
21
+ device: Optional[Union[torch.device, str]] = None,
22
+ dtype: Optional[torch.dtype] = None,
23
+ cond_widths: Optional[List[int]] = None,
24
+ cond_heights: Optional[List[int]] = None,
25
+ lora_index: int = 0,
26
+ n_loras: int = 1,
27
+ ):
28
+ super().__init__()
29
+ self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
30
+ self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
31
+ self.network_alpha = network_alpha
32
+ self.rank = rank
33
+ self.out_features = out_features
34
+ self.in_features = in_features
35
+
36
+ nn.init.normal_(self.down.weight, std=1 / rank)
37
+ nn.init.zeros_(self.up.weight)
38
+
39
+ self.cond_heights = cond_heights if cond_heights is not None else [512]
40
+ self.cond_widths = cond_widths if cond_widths is not None else [512]
41
+ self.lora_index = lora_index
42
+ self.n_loras = n_loras
43
+
44
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
45
+ orig_dtype = hidden_states.dtype
46
+ dtype = self.down.weight.dtype
47
+
48
+ batch_size = hidden_states.shape[0]
49
+
50
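+ # Each (w, h) condition image contributes (w/16) * (h/16) latent tokens after the
+ # 8x VAE downsample and 2x2 patchify; the mask below restricts this LoRA's output
+ # to the token slice belonging to its own lora_index, leaving other tokens untouched.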
+ cond_sizes = [(w // 8 * h // 8 * 16 // 64) for w, h in zip(self.cond_widths, self.cond_heights)]
51
+ total_cond_size = sum(cond_sizes)
52
+ block_size = hidden_states.shape[1] - total_cond_size
53
+
54
+ offset = sum(cond_sizes[:self.lora_index])
55
+ current_cond_size = cond_sizes[self.lora_index]
56
+
57
+ shape = (batch_size, hidden_states.shape[1], 3072)
58
+ mask = torch.ones(shape, device=hidden_states.device, dtype=dtype)
59
+
60
+ mask[:, :block_size + offset, :] = 0
61
+ mask[:, block_size + offset + current_cond_size:, :] = 0
62
+
63
+ hidden_states = mask * hidden_states
64
+
65
+ down_hidden_states = self.down(hidden_states.to(dtype))
66
+ up_hidden_states = self.up(down_hidden_states)
67
+
68
+ if self.network_alpha is not None:
69
+ up_hidden_states *= self.network_alpha / self.rank
70
+
71
+ return up_hidden_states.to(orig_dtype)
72
+
73
+
74
+ class MultiSingleStreamBlockLoraProcessor(nn.Module):
75
+ def __init__(self, dim: int, ranks: List[int], lora_weights: List[float], network_alphas: List[float], device=None, dtype=None, cond_widths: Optional[List[int]] = None, cond_heights: Optional[List[int]] = None, n_loras=1):
76
+ super().__init__()
77
+ self.n_loras = n_loras
78
+ self.cond_widths = cond_widths if cond_widths is not None else [512]
79
+ self.cond_heights = cond_heights if cond_heights is not None else [512]
80
+
81
+ self.q_loras = nn.ModuleList([
82
+ LoRALinearLayer(dim, dim, ranks[i], network_alphas[i], device=device, dtype=dtype, cond_widths=self.cond_widths, cond_heights=self.cond_heights, lora_index=i, n_loras=n_loras)
83
+ for i in range(n_loras)
84
+ ])
85
+ self.k_loras = nn.ModuleList([
86
+ LoRALinearLayer(dim, dim, ranks[i], network_alphas[i], device=device, dtype=dtype, cond_widths=self.cond_widths, cond_heights=self.cond_heights, lora_index=i, n_loras=n_loras)
87
+ for i in range(n_loras)
88
+ ])
89
+ self.v_loras = nn.ModuleList([
90
+ LoRALinearLayer(dim, dim, ranks[i], network_alphas[i], device=device, dtype=dtype, cond_widths=self.cond_widths, cond_heights=self.cond_heights, lora_index=i, n_loras=n_loras)
91
+ for i in range(n_loras)
92
+ ])
93
+ self.lora_weights = lora_weights
94
+ self.bank_attn = None
95
+ self.bank_kv: List[torch.Tensor] = []
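+ # bank_kv / bank_attn cache the condition tokens' key/value and attention outputs
+ # on the first denoising step; later steps reuse them instead of recomputing, and
+ # KontextEditModel.clear_cache() resets them between requests.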
96
+
97
+
98
+ def __call__(self,
99
+ attn: Attention,
100
+ hidden_states: torch.Tensor,
101
+ encoder_hidden_states: Optional[torch.Tensor] = None,
102
+ attention_mask: Optional[torch.Tensor] = None,
103
+ image_rotary_emb: Optional[torch.Tensor] = None,
104
+ use_cond = False
105
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
106
+
107
+ batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
108
+ scaled_seq_len = hidden_states.shape[1]
109
+
110
+ cond_sizes = [(w // 8 * h // 8 * 16 // 64) for w, h in zip(self.cond_widths, self.cond_heights)]
111
+ total_cond_size = sum(cond_sizes)
112
+ block_size = scaled_seq_len - total_cond_size
113
+
114
+ scaled_cond_sizes = cond_sizes
115
+ scaled_block_size = block_size
116
+
117
+ global TXTLEN
118
+ global KONTEXT
119
+ if KONTEXT:
120
+ img_start, img_end = TXTLEN, (TXTLEN + block_size) // 2
121
+ else:
122
+ img_start, img_end = TXTLEN, block_size
123
+ cond_start, cond_end = block_size, scaled_seq_len
124
+
125
+ cache = len(self.bank_kv) == 0
126
+
127
+ if cache:
128
+ query = attn.to_q(hidden_states)
129
+ key = attn.to_k(hidden_states)
130
+ value = attn.to_v(hidden_states)
131
+ for i in range(self.n_loras):
132
+ query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
133
+ key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
134
+ value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
135
+
136
+ inner_dim = key.shape[-1]
137
+ head_dim = inner_dim // attn.heads
138
+
139
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
140
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
141
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
142
+
143
+ self.bank_kv.extend([key[:, :, scaled_block_size:, :], value[:, :, scaled_block_size:, :]])
144
+
145
+ if attn.norm_q is not None: query = attn.norm_q(query)
146
+ if attn.norm_k is not None: key = attn.norm_k(key)
147
+
148
+ if image_rotary_emb is not None:
149
+ from diffusers.models.embeddings import apply_rotary_emb
150
+ query, key = apply_rotary_emb(query, image_rotary_emb), apply_rotary_emb(key, image_rotary_emb)
151
+
152
+ mask = torch.ones((scaled_seq_len, scaled_seq_len), device=hidden_states.device)
153
+ mask[ :scaled_block_size, :] = 0
154
+
155
+ current_offset = 0
156
+ for i in range(self.n_loras):
157
+ start, end = scaled_block_size + current_offset, scaled_block_size + current_offset + scaled_cond_sizes[i]
158
+ mask[start:end, start:end] = 0
159
+ current_offset += scaled_cond_sizes[i]
160
+
161
+ mask *= -1e20
162
+
163
+ c_factor = getattr(self, "c_factor", None)
164
+ if c_factor is not None:
165
+ # print(f"Using c_factor: {c_factor}")
166
+ current_offset = 0
167
+ for i in range(self.n_loras):
168
+ bias = torch.log(c_factor[i])
169
+ cond_i_start, cond_i_end = cond_start + current_offset, cond_start + current_offset + scaled_cond_sizes[i]
170
+ mask[img_start:img_end, cond_i_start:cond_i_end] = bias
171
+ current_offset += scaled_cond_sizes[i]
172
+
173
+ # c_factor_kontext = getattr(self, "c_factor_kontext", None)
174
+ # if c_factor_kontext is not None:
175
+ # bias = torch.log(c_factor_kontext)
176
+ # kontext_start, kontext_end = img_end, block_size
177
+ # mask[img_start:img_end, kontext_start:kontext_end] = bias
178
+ # mask[kontext_start:kontext_end, img_start:img_end] = bias
179
+
180
+ # mask[kontext_start:kontext_end, kontext_end:] = -1e20
181
+
182
+ hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=mask.to(query.dtype))
183
+ self.bank_attn = hidden_states[:, :, scaled_block_size:, :]
184
+
185
+ else:
186
+ query, key, value = attn.to_q(hidden_states), attn.to_k(hidden_states), attn.to_v(hidden_states)
187
+
188
+ inner_dim = query.shape[-1]
189
+ head_dim = inner_dim // attn.heads
190
+
191
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
192
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
193
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
194
+
195
+ key = torch.cat([key[:, :, :scaled_block_size, :], self.bank_kv[0]], dim=-2)
196
+ value = torch.cat([value[:, :, :scaled_block_size, :], self.bank_kv[1]], dim=-2)
197
+
198
+ if attn.norm_q is not None: query = attn.norm_q(query)
199
+ if attn.norm_k is not None: key = attn.norm_k(key)
200
+
201
+ if image_rotary_emb is not None:
202
+ from diffusers.models.embeddings import apply_rotary_emb
203
+ query, key = apply_rotary_emb(query, image_rotary_emb), apply_rotary_emb(key, image_rotary_emb)
204
+
205
+ query = query[:, :, :scaled_block_size, :]
206
+
207
+ attn_mask = None
208
+ c_factor = getattr(self, "c_factor", None)
209
+ if c_factor is not None:
210
+ # print(f"Using c_factor: {c_factor}")
211
+ attn_mask = torch.zeros((query.shape[2], key.shape[2]), device=query.device, dtype=query.dtype)
212
+ current_offset = 0
213
+ for i in range(self.n_loras):
214
+ bias = torch.log(c_factor[i])
215
+ cond_i_start, cond_i_end = cond_start + current_offset, cond_start + current_offset + scaled_cond_sizes[i]
216
+ attn_mask[img_start:img_end, cond_i_start:cond_i_end] = bias
217
+ current_offset += scaled_cond_sizes[i]
218
+
219
+ # c_factor_kontext = getattr(self, "c_factor_kontext", None)
220
+ # if c_factor_kontext is not None:
221
+ # if attn_mask is None:
222
+ # attn_mask = torch.zeros((query.shape[2], key.shape[2]), device=query.device, dtype=query.dtype)
223
+ # bias = torch.log(c_factor_kontext)
224
+ # kontext_start, kontext_end = img_end, block_size
225
+ # attn_mask[img_start:img_end, kontext_start:kontext_end] = bias
226
+ # attn_mask[kontext_start:kontext_end, img_start:img_end] = bias
227
+
228
+ # attn_mask[kontext_start:kontext_end, kontext_end:] = -1e20
229
+
230
+ hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attn_mask)
231
+ if self.bank_attn is not None: hidden_states = torch.cat([hidden_states, self.bank_attn], dim=-2)
232
+
233
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
234
+ hidden_states = hidden_states.to(query.dtype)
235
+
236
+ cond_hidden_states = hidden_states[:, block_size:,:]
237
+ hidden_states = hidden_states[:, : block_size,:]
238
+
239
+ return (hidden_states, cond_hidden_states) if use_cond else hidden_states
240
+
241
+
242
+ class MultiDoubleStreamBlockLoraProcessor(nn.Module):
243
+ def __init__(self, dim: int, ranks: List[int], lora_weights: List[float], network_alphas: List[float], device=None, dtype=None, cond_widths: Optional[List[int]] = None, cond_heights: Optional[List[int]] = None, n_loras=1):
244
+ super().__init__()
245
+
246
+ self.n_loras = n_loras
247
+ self.cond_widths = cond_widths if cond_widths is not None else [512]
248
+ self.cond_heights = cond_heights if cond_heights is not None else [512]
249
+ self.q_loras = nn.ModuleList([LoRALinearLayer(dim, dim, ranks[i], network_alphas[i], device=device, dtype=dtype, cond_widths=self.cond_widths, cond_heights=self.cond_heights, lora_index=i, n_loras=n_loras) for i in range(n_loras)])
250
+ self.k_loras = nn.ModuleList([LoRALinearLayer(dim, dim, ranks[i], network_alphas[i], device=device, dtype=dtype, cond_widths=self.cond_widths, cond_heights=self.cond_heights, lora_index=i, n_loras=n_loras) for i in range(n_loras)])
251
+ self.v_loras = nn.ModuleList([LoRALinearLayer(dim, dim, ranks[i], network_alphas[i], device=device, dtype=dtype, cond_widths=self.cond_widths, cond_heights=self.cond_heights, lora_index=i, n_loras=n_loras) for i in range(n_loras)])
252
+ self.proj_loras = nn.ModuleList([LoRALinearLayer(dim, dim, ranks[i], network_alphas[i], device=device, dtype=dtype, cond_widths=self.cond_widths, cond_heights=self.cond_heights, lora_index=i, n_loras=n_loras) for i in range(n_loras)])
253
+ self.lora_weights = lora_weights
254
+ self.bank_attn = None
255
+ self.bank_kv: List[torch.Tensor] = []
256
+
257
+
258
+ def __call__(self,
259
+ attn: Attention,
260
+ hidden_states: torch.Tensor,
261
+ encoder_hidden_states: Optional[torch.Tensor] = None,
262
+ attention_mask: Optional[torch.Tensor] = None,
263
+ image_rotary_emb: Optional[torch.Tensor] = None,
264
+ use_cond=False,
265
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
266
+
267
+ global TXTLEN
268
+ global KONTEXT
269
+ TXTLEN = encoder_hidden_states.shape[1] if encoder_hidden_states is not None else 128
270
+
271
+ batch_size, _, _ = hidden_states.shape
272
+
273
+ cond_sizes = [(w // 8 * h // 8 * 16 // 64) for w, h in zip(self.cond_widths, self.cond_heights)]
274
+ block_size = hidden_states.shape[1] - sum(cond_sizes)
275
+
276
+ scaled_seq_len = encoder_hidden_states.shape[1] + hidden_states.shape[1]
277
+ scaled_cond_sizes = cond_sizes
278
+ scaled_block_size = scaled_seq_len - sum(scaled_cond_sizes)
279
+
280
+ if KONTEXT:
281
+ img_start, img_end = TXTLEN, (TXTLEN + block_size) // 2
282
+ else:
283
+ img_start, img_end = TXTLEN, block_size
284
+ cond_start, cond_end = scaled_block_size, scaled_seq_len
285
+
286
+ inner_dim, head_dim = 3072, 3072 // attn.heads
287
+
288
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states).view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
289
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states).view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
290
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states).view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
291
+
292
+ if attn.norm_added_q is not None: encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
293
+ if attn.norm_added_k is not None: encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
294
+
295
+ cache = len(self.bank_kv) == 0
296
+
297
+ if cache:
298
+ query, key, value = attn.to_q(hidden_states), attn.to_k(hidden_states), attn.to_v(hidden_states)
299
+ for i in range(self.n_loras):
300
+ query, key, value = query + self.lora_weights[i] * self.q_loras[i](hidden_states), key + self.lora_weights[i] * self.k_loras[i](hidden_states), value + self.lora_weights[i] * self.v_loras[i](hidden_states)
301
+
302
+ query, key, value = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2), key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2), value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
303
+
304
+ self.bank_kv.extend([key[:, :, block_size:, :], value[:, :, block_size:, :]])
305
+
306
+ if attn.norm_q is not None: query = attn.norm_q(query)
307
+ if attn.norm_k is not None: key = attn.norm_k(key)
308
+
309
+ query, key, value = torch.cat([encoder_hidden_states_query_proj, query], dim=2), torch.cat([encoder_hidden_states_key_proj, key], dim=2), torch.cat([encoder_hidden_states_value_proj, value], dim=2)
310
+
311
+ if image_rotary_emb is not None:
312
+ from diffusers.models.embeddings import apply_rotary_emb
313
+ query, key = apply_rotary_emb(query, image_rotary_emb), apply_rotary_emb(key, image_rotary_emb)
314
+
315
+ mask = torch.ones((scaled_seq_len, scaled_seq_len), device=hidden_states.device)
316
+ mask[:scaled_block_size, :] = 0
317
+
318
+ current_offset = 0
319
+ for i in range(self.n_loras):
320
+ start, end = scaled_block_size + current_offset, scaled_block_size + current_offset + scaled_cond_sizes[i]
321
+ mask[start:end, start:end] = 0
322
+ current_offset += scaled_cond_sizes[i]
323
+
324
+ mask *= -1e20
325
+
326
+ c_factor = getattr(self, "c_factor", None)
327
+ if c_factor is not None:
328
+ # print(f"Using c_factor: {c_factor}")
329
+ current_offset = 0
330
+ for i in range(self.n_loras):
331
+ bias = torch.log(c_factor[i])
332
+ cond_i_start, cond_i_end = cond_start + current_offset, cond_start + current_offset + scaled_cond_sizes[i]
333
+ mask[img_start:img_end, cond_i_start:cond_i_end] = bias
334
+ current_offset += scaled_cond_sizes[i]
335
+
336
+ # c_factor_kontext = getattr(self, "c_factor_kontext", None)
337
+ # if c_factor_kontext is not None:
338
+ # bias = torch.log(c_factor_kontext)
339
+ # kontext_start, kontext_end = img_end, block_size
340
+ # mask[img_start:img_end, kontext_start:kontext_end] = bias
341
+ # mask[kontext_start:kontext_end, img_start:img_end] = bias
342
+
343
+ # mask[kontext_start:kontext_end, kontext_end:] = -1e20
344
+
345
+ hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=mask.to(query.dtype))
346
+ self.bank_attn = hidden_states[:, :, scaled_block_size:, :]
347
+
348
+ else:
349
+ query, key, value = attn.to_q(hidden_states), attn.to_k(hidden_states), attn.to_v(hidden_states)
350
+
351
+ query, key, value = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2), key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2), value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
352
+
353
+ key, value = torch.cat([key[:, :, :block_size, :], self.bank_kv[0]], dim=-2), torch.cat([value[:, :, :block_size, :], self.bank_kv[1]], dim=-2)
354
+
355
+ if attn.norm_q is not None: query = attn.norm_q(query)
356
+ if attn.norm_k is not None: key = attn.norm_k(key)
357
+
358
+ query, key, value = torch.cat([encoder_hidden_states_query_proj, query], dim=2), torch.cat([encoder_hidden_states_key_proj, key], dim=2), torch.cat([encoder_hidden_states_value_proj, value], dim=2)
359
+
360
+ if image_rotary_emb is not None:
361
+ from diffusers.models.embeddings import apply_rotary_emb
362
+ query, key = apply_rotary_emb(query, image_rotary_emb), apply_rotary_emb(key, image_rotary_emb)
363
+
364
+ query = query[:, :, :scaled_block_size, :]
365
+
366
+ attn_mask = None
367
+ c_factor = getattr(self, "c_factor", None)
368
+ if c_factor is not None:
369
+ # print(f"Using c_factor: {c_factor}")
370
+ attn_mask = torch.zeros((query.shape[2], key.shape[2]), device=query.device, dtype=query.dtype)
371
+ current_offset = 0
372
+ for i in range(self.n_loras):
373
+ bias = torch.log(c_factor[i])
374
+ cond_i_start, cond_i_end = cond_start + current_offset, cond_start + current_offset + scaled_cond_sizes[i]
375
+ attn_mask[img_start:img_end, cond_i_start:cond_i_end] = bias
376
+ current_offset += scaled_cond_sizes[i]
377
+
378
+ # c_factor_kontext = getattr(self, "c_factor_kontext", None)
379
+ # if c_factor_kontext is not None:
380
+ # if attn_mask is None:
381
+ # attn_mask = torch.zeros((query.shape[2], key.shape[2]), device=query.device, dtype=query.dtype)
382
+ # bias = torch.log(c_factor_kontext)
383
+ # kontext_start, kontext_end = img_end, block_size
384
+ # attn_mask[img_start:img_end, kontext_start:kontext_end] = bias
385
+ # attn_mask[kontext_start:kontext_end, img_start:img_end] = bias
386
+
387
+ # attn_mask[kontext_start:kontext_end, kontext_end:] = -1e20
388
+
389
+ hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attn_mask)
390
+ if self.bank_attn is not None: hidden_states = torch.cat([hidden_states, self.bank_attn], dim=-2)
391
+
392
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
393
+ hidden_states = hidden_states.to(query.dtype)
394
+
395
+ encoder_hidden_states, hidden_states = hidden_states[:, :encoder_hidden_states.shape[1]], hidden_states[:, encoder_hidden_states.shape[1]:]
396
+
397
+ hidden_states = attn.to_out[0](hidden_states)
398
+ for i in range(self.n_loras):
399
+ hidden_states = hidden_states + self.lora_weights[i] * self.proj_loras[i](hidden_states)
400
+ hidden_states = attn.to_out[1](hidden_states)
401
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
402
+
403
+ cond_hidden_states = hidden_states[:, block_size:,:]
404
+ hidden_states = hidden_states[:, :block_size,:]
405
+
406
+ return (hidden_states, encoder_hidden_states, cond_hidden_states) if use_cond else (encoder_hidden_states, hidden_states)
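The additive attention mask built above is how the control strength enters the model: entries set to -1e20 are effectively masked out, while adding log(c_factor) to the image-to-condition logits scales those attention weights by c_factor after the softmax. A minimal, self-contained sketch of that equivalence (toy tensors, not the processor's real shapes):

    import torch
    import torch.nn.functional as F

    logits = torch.tensor([[1.0, 0.5, 0.2, 0.1]])   # one query over 4 keys; last 2 are "condition" keys
    c = 2.0                                          # hypothetical control strength (gamma)

    bias = torch.zeros_like(logits)
    bias[:, 2:] = torch.log(torch.tensor(c))         # same trick as mask[img, cond] = log(c_factor)

    plain = F.softmax(logits, dim=-1)
    biased = F.softmax(logits + bias, dim=-1)

    # The condition-vs-rest attention mass grows by exactly a factor of c.
    ratio_plain = plain[:, 2:].sum() / plain[:, :2].sum()
    ratio_biased = biased[:, 2:].sum() / biased[:, :2].sum()
    assert torch.allclose(ratio_biased, c * ratio_plain)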
src/lora_helper.py ADDED
@@ -0,0 +1,194 @@
1
+ from diffusers.models.attention_processor import FluxAttnProcessor2_0
2
+ from safetensors.torch import load_file
3
+ import re
4
+ import torch
5
+ from .layers_cache import MultiDoubleStreamBlockLoraProcessor, MultiSingleStreamBlockLoraProcessor
6
+
7
+ device = "cuda"
8
+
9
+ def load_safetensors(path):
10
+ """Safely loads tensors from a file and maps them to the CPU."""
11
+ return load_file(path, device="cpu")
12
+
13
+ def get_lora_count_from_checkpoint(checkpoint):
14
+ """
15
+ Infers the number of LoRA modules stored in a checkpoint by inspecting its keys.
16
+ Also prints a sample of keys for debugging.
17
+ """
18
+ lora_indices = set()
19
+ # Regex to find '..._loras.X.' where X is a number.
20
+ indexed_pattern = re.compile(r'._loras\.(\d+)\.')
21
+ found_keys = []
22
+
23
+ for key in checkpoint.keys():
24
+ match = indexed_pattern.search(key)
25
+ if match:
26
+ lora_indices.add(int(match.group(1)))
27
+ if len(found_keys) < 5 and key not in found_keys:
28
+ found_keys.append(key)
29
+
30
+ if lora_indices:
31
+ lora_count = max(lora_indices) + 1
32
+ print("INFO: Auto-detected indexed LoRA keys in checkpoint.")
33
+ print(f" Found {lora_count} LoRA module(s).")
34
+ print(" Sample keys:", found_keys)
35
+ return lora_count
36
+
37
+ # Fallback for legacy, non-indexed checkpoints.
38
+ legacy_found = False
39
+ legacy_key_sample = ""
40
+ for key in checkpoint.keys():
41
+ if '.q_lora.' in key:
42
+ legacy_found = True
43
+ legacy_key_sample = key
44
+ break
45
+
46
+ if legacy_found:
47
+ print("INFO: Auto-detected legacy (non-indexed) LoRA keys in checkpoint.")
48
+ print(" Assuming 1 LoRA module.")
49
+ print(" Sample key:", legacy_key_sample)
50
+ return 1
51
+
52
+ print("WARNING: No LoRA keys found in the checkpoint.")
53
+ return 0
54
+
55
+ def get_lora_ranks(checkpoint, num_loras):
56
+ """
57
+ Determines the rank for each LoRA module from the checkpoint.
58
+ It supports both indexed (e.g., 'loras.0') and legacy non-indexed formats.
59
+ """
60
+ ranks = {}
61
+
62
+ # First, try to find ranks for all indexed LoRA modules.
63
+ for i in range(num_loras):
64
+ # Find a key that uniquely identifies the i-th LoRA's down projection.
65
+ rank_pattern = re.compile(rf'._loras\.({i})\.down\.weight')
66
+ for k, v in checkpoint.items():
67
+ if rank_pattern.search(k):
68
+ ranks[i] = v.shape[0]
69
+ break
70
+
71
+ # If not all ranks were found, there might be legacy keys or a mismatch.
72
+ if len(ranks) != num_loras:
73
+ # Fallback for single, non-indexed LoRA checkpoints.
74
+ if num_loras == 1:
75
+ for k, v in checkpoint.items():
76
+ if ".q_lora.down.weight" in k:
77
+ return [v.shape[0]]
78
+
79
+ # If still unresolved, use the rank of the very first LoRA found as a default for all.
80
+ first_found_rank = next((v.shape[0] for k, v in checkpoint.items() if k.endswith(".down.weight")), None)
81
+
82
+ if first_found_rank is None:
83
+ raise ValueError("Could not determine any LoRA rank from the provided checkpoint.")
84
+
85
+ # Return a list where missing ranks are filled with the first one found.
86
+ return [ranks.get(i, first_found_rank) for i in range(num_loras)]
87
+
88
+ # Return the list of ranks sorted by LoRA index.
89
+ return [ranks[i] for i in range(num_loras)]
90
+
91
+
92
+ def load_checkpoint(local_path):
93
+ if local_path is not None:
94
+ if '.safetensors' in local_path:
95
+ print(f"Loading .safetensors checkpoint from {local_path}")
96
+ checkpoint = load_safetensors(local_path)
97
+ else:
98
+ print(f"Loading checkpoint from {local_path}")
99
+ checkpoint = torch.load(local_path, map_location='cpu')
100
+ return checkpoint
101
+
102
+
103
+ def prepare_lora_processors(checkpoint, lora_weights, transformer, cond_size, number=None):
104
+ # Ensure processors match the transformer's device and dtype
105
+ try:
106
+ first_param = next(transformer.parameters())
107
+ target_device = first_param.device
108
+ target_dtype = first_param.dtype
109
+ except StopIteration:
110
+ target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
111
+ target_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
112
+
113
+ if number is None:
114
+ number = get_lora_count_from_checkpoint(checkpoint)
115
+ if number == 0:
116
+ return {}
117
+
118
+ if lora_weights and len(lora_weights) != number:
119
+ print(f"WARNING: Provided `lora_weights` length ({len(lora_weights)}) differs from detected LoRA count ({number}).")
120
+ final_weights = (lora_weights + [1.0] * number)[:number]
121
+ print(f" Adjusting weights to: {final_weights}")
122
+ lora_weights = final_weights
123
+ elif not lora_weights:
124
+ print(f"INFO: No `lora_weights` provided. Defaulting to weights of 1.0 for all {number} LoRAs.")
125
+ lora_weights = [1.0] * number
126
+
127
+ ranks = get_lora_ranks(checkpoint, number)
128
+ print("INFO: Determined ranks for LoRA modules:", ranks)
129
+
130
+ cond_widths = cond_size if isinstance(cond_size, list) else [cond_size] * number
131
+ cond_heights = cond_size if isinstance(cond_size, list) else [cond_size] * number
132
+
133
+ lora_attn_procs = {}
134
+ double_blocks_idx = list(range(19))
135
+ single_blocks_idx = list(range(38))
136
+
137
+ # Get all attention processor names from the transformer to iterate over
138
+ for name in transformer.attn_processors.keys():
139
+ match = re.search(r'\.(\d+)\.', name)
140
+ if not match:
141
+ continue
142
+ layer_index = int(match.group(1))
143
+
144
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
145
+ lora_state_dicts = {
146
+ key: value for key, value in checkpoint.items()
147
+ if f"transformer_blocks.{layer_index}." in key
148
+ }
149
+
150
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
151
+ dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=lora_weights,
152
+ device=target_device, dtype=target_dtype, cond_widths=cond_widths, cond_heights=cond_heights, n_loras=number
153
+ )
154
+
155
+ for n in range(number):
156
+ lora_prefix_q = f"{name}.q_loras.{n}"
157
+ lora_prefix_k = f"{name}.k_loras.{n}"
158
+ lora_prefix_v = f"{name}.v_loras.{n}"
159
+ lora_prefix_proj = f"{name}.proj_loras.{n}"
160
+
161
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{lora_prefix_q}.down.weight')
162
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{lora_prefix_q}.up.weight')
163
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{lora_prefix_k}.down.weight')
164
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{lora_prefix_k}.up.weight')
165
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{lora_prefix_v}.down.weight')
166
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{lora_prefix_v}.up.weight')
167
+ lora_attn_procs[name].proj_loras[n].down.weight.data = lora_state_dicts.get(f'{lora_prefix_proj}.down.weight')
168
+ lora_attn_procs[name].proj_loras[n].up.weight.data = lora_state_dicts.get(f'{lora_prefix_proj}.up.weight')
169
+ lora_attn_procs[name].to(device=target_device, dtype=target_dtype)
170
+
171
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
172
+ lora_state_dicts = {
173
+ key: value for key, value in checkpoint.items()
174
+ if f"single_transformer_blocks.{layer_index}." in key
175
+ }
176
+
177
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
178
+ dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=lora_weights,
179
+ device=target_device, dtype=target_dtype, cond_widths=cond_widths, cond_heights=cond_heights, n_loras=number
180
+ )
181
+
182
+ for n in range(number):
183
+ lora_prefix_q = f"{name}.q_loras.{n}"
184
+ lora_prefix_k = f"{name}.k_loras.{n}"
185
+ lora_prefix_v = f"{name}.v_loras.{n}"
186
+
187
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{lora_prefix_q}.down.weight')
188
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{lora_prefix_q}.up.weight')
189
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{lora_prefix_k}.down.weight')
190
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{lora_prefix_k}.up.weight')
191
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{lora_prefix_v}.down.weight')
192
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{lora_prefix_v}.up.weight')
193
+ lora_attn_procs[name].to(device=target_device, dtype=target_dtype)
194
+ return lora_attn_procs
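Putting the helpers together, a typical (hedged) flow for installing control LoRAs on a Flux transformer looks roughly like this; the checkpoint path and the `pipe` object are placeholders, not fixed names from this repo:

    from src.lora_helper import load_checkpoint, prepare_lora_processors

    ckpt = load_checkpoint("checkpoints/edge_control.safetensors")   # placeholder path
    procs = prepare_lora_processors(
        checkpoint=ckpt,
        lora_weights=[1.0],             # one entry per LoRA; padded/truncated to the detected count
        transformer=pipe.transformer,   # `pipe` is an already-loaded FluxKontextControlPipeline
        cond_size=512,
    )
    pipe.transformer.set_attn_processor(procs)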
src/pipeline_flux_kontext_control.py ADDED
@@ -0,0 +1,1230 @@
1
+ import inspect
2
+ from typing import Any, Callable, Dict, List, Optional, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ from transformers import (
7
+ CLIPImageProcessor,
8
+ CLIPTextModel,
9
+ CLIPTokenizer,
10
+ CLIPVisionModelWithProjection,
11
+ T5EncoderModel,
12
+ T5TokenizerFast,
13
+ )
14
+
15
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
16
+ from diffusers.loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
17
+ from diffusers.models import AutoencoderKL, FluxTransformer2DModel
18
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
19
+ from diffusers.utils import (
20
+ USE_PEFT_BACKEND,
21
+ is_torch_xla_available,
22
+ logging,
23
+ replace_example_docstring,
24
+ scale_lora_layers,
25
+ unscale_lora_layers,
26
+ )
27
+ from diffusers.utils.torch_utils import randn_tensor
28
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
29
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
30
+ from torchvision.transforms.functional import pad
31
+ from diffusers.models.attention_processor import FluxAttnProcessor2_0
32
+ from .lora_helper import prepare_lora_processors, load_checkpoint
33
+ from .layers_cache import MultiDoubleStreamBlockLoraProcessor, MultiSingleStreamBlockLoraProcessor
34
+ import re
35
+
36
+
37
+ if is_torch_xla_available():
38
+ import torch_xla.core.xla_model as xm
39
+
40
+ XLA_AVAILABLE = True
41
+ else:
42
+ XLA_AVAILABLE = False
43
+
44
+
45
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
46
+
47
+ PREFERRED_KONTEXT_RESOLUTIONS = [
48
+ (672, 1568),
49
+ (688, 1504),
50
+ (720, 1456),
51
+ (752, 1392),
52
+ (800, 1328),
53
+ (832, 1248),
54
+ (880, 1184),
55
+ (944, 1104),
56
+ (1024, 1024),
57
+ (1104, 944),
58
+ (1184, 880),
59
+ (1248, 832),
60
+ (1328, 800),
61
+ (1392, 752),
62
+ (1456, 720),
63
+ (1504, 688),
64
+ (1568, 672),
65
+ ]
66
+
67
+
68
+ def calculate_shift(
69
+ image_seq_len,
70
+ base_seq_len: int = 256,
71
+ max_seq_len: int = 4096,
72
+ base_shift: float = 0.5,
73
+ max_shift: float = 1.15,
74
+ ):
75
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
76
+ b = base_shift - m * base_seq_len
77
+ mu = image_seq_len * m + b
78
+ return mu
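`calculate_shift` is just linear interpolation between (base_seq_len, base_shift) and (max_seq_len, max_shift), so the endpoints recover the base values; a quick sanity check:

    assert abs(calculate_shift(256) - 0.5) < 1e-6     # base_seq_len -> base_shift
    assert abs(calculate_shift(4096) - 1.15) < 1e-6   # max_seq_len  -> max_shift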
79
+
80
+
81
+ def prepare_latent_image_ids_(height, width, device, dtype):
82
+ latent_image_ids = torch.zeros(height, width, 3, device=device, dtype=dtype)
83
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height, device=device)[:, None] # y
84
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width, device=device)[None, :] # x
85
+ return latent_image_ids
86
+
87
+
88
+ def prepare_latent_subject_ids(height, width, device, dtype):
89
+ latent_image_ids = torch.zeros(height, width, 3, device=device, dtype=dtype)
90
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height, device=device)[:, None]
91
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width, device=device)[None, :]
92
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
93
+ latent_image_ids = latent_image_ids.reshape(
94
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
95
+ )
96
+ return latent_image_ids.to(device=device, dtype=dtype)
97
+
98
+
99
+ def resize_position_encoding(
100
+ batch_size, original_height, original_width, target_height, target_width, device, dtype
101
+ ):
102
+ latent_image_ids = prepare_latent_image_ids_(original_height // 2, original_width // 2, device, dtype)
103
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
104
+ latent_image_ids = latent_image_ids.reshape(
105
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
106
+ )
107
+
108
+ scale_h = original_height / target_height
109
+ scale_w = original_width / target_width
110
+ latent_image_ids_resized = torch.zeros(target_height // 2, target_width // 2, 3, device=device, dtype=dtype)
111
+ latent_image_ids_resized[..., 1] = (
112
+ latent_image_ids_resized[..., 1] + torch.arange(target_height // 2, device=device)[:, None] * scale_h
113
+ )
114
+ latent_image_ids_resized[..., 2] = (
115
+ latent_image_ids_resized[..., 2] + torch.arange(target_width // 2, device=device)[None, :] * scale_w
116
+ )
117
+
118
+ cond_latent_image_id_height, cond_latent_image_id_width, cond_latent_image_id_channels = (
119
+ latent_image_ids_resized.shape
120
+ )
121
+ cond_latent_image_ids = latent_image_ids_resized.reshape(
122
+ cond_latent_image_id_height * cond_latent_image_id_width, cond_latent_image_id_channels
123
+ )
124
+ return latent_image_ids, cond_latent_image_ids
125
+
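`resize_position_encoding` keeps the condition tokens on the same coordinate grid as the full-resolution latents by stretching their (y, x) indices with scale_h/scale_w. An illustrative call with made-up latent sizes (a 128x128 noise grid paired with a 64x64 condition grid):

    import torch
    ids, cond_ids = resize_position_encoding(1, 128, 128, 64, 64, device="cpu", dtype=torch.float32)
    # ids:      (64*64, 3)  with integer coordinates 0..63
    # cond_ids: (32*32, 3)  with coordinates 0, 2, 4, ..., 62 -- same spatial span, half the tokens per axis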
126
+
127
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
128
+ def retrieve_timesteps(
129
+ scheduler,
130
+ num_inference_steps: Optional[int] = None,
131
+ device: Optional[Union[str, torch.device]] = None,
132
+ timesteps: Optional[List[int]] = None,
133
+ sigmas: Optional[List[float]] = None,
134
+ **kwargs,
135
+ ):
136
+ r"""
137
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
138
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
139
+
140
+ Args:
141
+ scheduler (`SchedulerMixin`):
142
+ The scheduler to get timesteps from.
143
+ num_inference_steps (`int`):
144
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
145
+ must be `None`.
146
+ device (`str` or `torch.device`, *optional*):
147
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
148
+ timesteps (`List[int]`, *optional*):
149
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
150
+ `num_inference_steps` and `sigmas` must be `None`.
151
+ sigmas (`List[float]`, *optional*):
152
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
153
+ `num_inference_steps` and `timesteps` must be `None`.
154
+
155
+ Returns:
156
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
157
+ second element is the number of inference steps.
158
+ """
159
+ if timesteps is not None and sigmas is not None:
160
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
161
+ if timesteps is not None:
162
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
163
+ if not accepts_timesteps:
164
+ raise ValueError(
165
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
166
+ f" timestep schedules. Please check whether you are using the correct scheduler."
167
+ )
168
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
169
+ timesteps = scheduler.timesteps
170
+ num_inference_steps = len(timesteps)
171
+ elif sigmas is not None:
172
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
173
+ if not accept_sigmas:
174
+ raise ValueError(
175
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
176
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
177
+ )
178
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
179
+ timesteps = scheduler.timesteps
180
+ num_inference_steps = len(timesteps)
181
+ else:
182
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
183
+ timesteps = scheduler.timesteps
184
+ return timesteps, num_inference_steps
185
+
186
+
187
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
188
+ def retrieve_latents(
189
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
190
+ ):
191
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
192
+ return encoder_output.latent_dist.sample(generator)
193
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
194
+ return encoder_output.latent_dist.mode()
195
+ elif hasattr(encoder_output, "latents"):
196
+ return encoder_output.latents
197
+ else:
198
+ raise AttributeError("Could not access latents of provided encoder_output")
199
+
200
+
201
+ class FluxKontextControlPipeline(
202
+ DiffusionPipeline,
203
+ FluxLoraLoaderMixin,
204
+ FromSingleFileMixin,
205
+ TextualInversionLoaderMixin,
206
+ ):
207
+ r"""
208
+ The Flux Kontext pipeline for image-to-image and text-to-image generation with a control module.
209
+
210
+ Reference: https://bfl.ai/announcements/flux-1-kontext-dev
211
+
212
+ Args:
213
+ transformer ([`FluxTransformer2DModel`]):
214
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
215
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
216
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
217
+ vae ([`AutoencoderKL`]):
218
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
219
+ text_encoder ([`CLIPTextModel`]):
220
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
221
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
222
+ text_encoder_2 ([`T5EncoderModel`]):
223
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
224
+ the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
225
+ tokenizer (`CLIPTokenizer`):
226
+ Tokenizer of class
227
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
228
+ tokenizer_2 (`T5TokenizerFast`):
229
+ Second Tokenizer of class
230
+ [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
231
+ """
232
+
233
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
234
+ _optional_components = []
235
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
236
+
237
+ def __init__(
238
+ self,
239
+ scheduler: FlowMatchEulerDiscreteScheduler,
240
+ vae: AutoencoderKL,
241
+ text_encoder: CLIPTextModel,
242
+ tokenizer: CLIPTokenizer,
243
+ text_encoder_2: T5EncoderModel,
244
+ tokenizer_2: T5TokenizerFast,
245
+ transformer: FluxTransformer2DModel,
246
+ image_encoder: CLIPVisionModelWithProjection = None,
247
+ feature_extractor: CLIPImageProcessor = None,
248
+ ):
249
+ super().__init__()
250
+
251
+ self.register_modules(
252
+ vae=vae,
253
+ text_encoder=text_encoder,
254
+ text_encoder_2=text_encoder_2,
255
+ tokenizer=tokenizer,
256
+ tokenizer_2=tokenizer_2,
257
+ transformer=transformer,
258
+ scheduler=scheduler,
259
+ image_encoder=None,
260
+ feature_extractor=None,
261
+ )
262
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
263
+ # Flux latents are packed into 2x2 patches, so use VAE factor multiplied by patch size for image processing
264
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
265
+ self.tokenizer_max_length = (
266
+ self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
267
+ )
268
+ self.default_sample_size = 128
269
+ self.latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
270
+ self.control_lora_processors: Dict[str, Dict[str, Any]] = {}
271
+ self.control_lora_cond_sizes: Dict[str, Any] = {}
272
+ self.control_lora_weights: Dict[str, Any] = {}
273
+ self.current_control_type: Optional[Union[str, List[str]]] = None
274
+
275
+ def load_control_loras(self, lora_config: Dict[str, Dict[str, Any]]):
276
+ """
277
+ Loads and prepares LoRA attention processors for different control types.
278
+ Args:
279
+ lora_config: A dict where keys are control types (e.g., 'edge') and values are dicts
280
+ containing 'path', 'lora_weights', and 'cond_size'.
281
+ """
282
+ for control_type, config in lora_config.items():
283
+ print(f"Loading LoRA for control type: {control_type}")
284
+ checkpoint = load_checkpoint(config["path"])
285
+ processors = prepare_lora_processors(
286
+ checkpoint=checkpoint,
287
+ lora_weights=config["lora_weights"],
288
+ transformer=self.transformer,
289
+ cond_size=config["cond_size"],
290
+ number=len(config["lora_weights"]) if config.get("lora_weights") is not None else None,
291
+ )
292
+ self.control_lora_processors[control_type] = processors
293
+ self.control_lora_cond_sizes[control_type] = config["cond_size"]
294
+ self.control_lora_weights[control_type] = config["lora_weights"]
295
+ print("All control LoRAs loaded and prepared.")
296
+
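A hedged example of the `lora_config` dict that `load_control_loras` expects (control-type names, paths, and sizes here are placeholders):

    pipe.load_control_loras({
        "edge":  {"path": "checkpoints/edge_lora.safetensors",  "lora_weights": [1.0], "cond_size": 512},
        "color": {"path": "checkpoints/color_lora.safetensors", "lora_weights": [1.0], "cond_size": 512},
    })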
297
+ def _combine_control_loras(self, control_types: List[str]):
298
+ """
299
+ Combines multiple control LoRAs into a single set of attention processors.
300
+ """
301
+ if not control_types:
302
+ return FluxAttnProcessor2_0()
303
+
304
+ try:
305
+ first_param = next(self.transformer.parameters())
306
+ target_device = first_param.device
307
+ target_dtype = first_param.dtype
308
+ except StopIteration:
309
+ target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
310
+ target_dtype = torch.float32
311
+
312
+ combined_procs = {}
313
+ # LoRA weights must come from configuration, not from gammas (which control strength)
314
+ all_lora_weights = []
315
+
316
+ # Determine total number of LoRAs and ranks across all control types
317
+ total_loras = 0
318
+ all_ranks = []
319
+ all_cond_sizes = []
320
+
321
+ for control_type in control_types:
322
+ procs = self.control_lora_processors.get(control_type)
323
+ if not procs:
324
+ raise ValueError(f"Control type '{control_type}' not loaded.")
325
+ # Collect configured LoRA weights for this control type
326
+ conf_weights = self.control_lora_weights.get(control_type)
327
+ if conf_weights is None:
328
+ raise ValueError(f"Control type '{control_type}' has no configured lora_weights.")
329
+ all_lora_weights.extend(conf_weights)
330
+
331
+ # Get n_loras from the first processor
332
+ first_proc = next(iter(procs.values()))
333
+ n_loras_in_control = first_proc.n_loras
334
+ total_loras += n_loras_in_control
335
+
336
+ # Correctly get ranks from the processor's LoRA layers
337
+ proc_ranks = [lora.down.weight.shape[0] for lora in first_proc.q_loras]
338
+ all_ranks.extend(proc_ranks)
339
+
340
+ cond_size = self.control_lora_cond_sizes[control_type]
341
+ cond_sizes = [cond_size] * n_loras_in_control if not isinstance(cond_size, list) else cond_size
342
+ all_cond_sizes.extend(cond_sizes)
343
+
344
+ for name in self.transformer.attn_processors.keys():
345
+ match = re.search(r'\.(\d+)\.', name)
346
+ if not match:
347
+ continue
348
+ layer_index = int(match.group(1))
349
+
350
+ if name.startswith("transformer_blocks"):
351
+ new_proc = MultiDoubleStreamBlockLoraProcessor(
352
+ dim=3072, ranks=all_ranks, network_alphas=all_ranks, lora_weights=all_lora_weights,
353
+ device=target_device, dtype=target_dtype,
354
+ cond_widths=all_cond_sizes, cond_heights=all_cond_sizes, n_loras=total_loras
355
+ )
356
+ elif name.startswith("single_transformer_blocks"):
357
+ new_proc = MultiSingleStreamBlockLoraProcessor(
358
+ dim=3072, ranks=all_ranks, network_alphas=all_ranks, lora_weights=all_lora_weights,
359
+ device=target_device, dtype=target_dtype,
360
+ cond_widths=all_cond_sizes, cond_heights=all_cond_sizes, n_loras=total_loras
361
+ )
362
+ else:
363
+ continue
364
+
365
+ lora_idx_offset = 0
366
+ for control_type in control_types:
367
+ source_proc = self.control_lora_processors[control_type][name]
368
+ for i in range(source_proc.n_loras):
369
+ current_lora_idx = lora_idx_offset + i
370
+ # Copy weights for q, k, v, proj
371
+ new_proc.q_loras[current_lora_idx].load_state_dict(source_proc.q_loras[i].state_dict())
372
+ new_proc.k_loras[current_lora_idx].load_state_dict(source_proc.k_loras[i].state_dict())
373
+ new_proc.v_loras[current_lora_idx].load_state_dict(source_proc.v_loras[i].state_dict())
374
+ if hasattr(new_proc, 'proj_loras'):
375
+ new_proc.proj_loras[current_lora_idx].load_state_dict(source_proc.proj_loras[i].state_dict())
376
+
377
+ lora_idx_offset += source_proc.n_loras
378
+
379
+ combined_procs[name] = new_proc.to(device=target_device, dtype=target_dtype)
380
+
381
+ return combined_procs
382
+
383
+ def set_gamma_values(self, gammas: List[float]):
384
+ """
385
+ Set gamma values for bias control modulation on current attention processors and attention modules.
386
+ """
387
+ print(f"Setting gamma values to: {gammas}")
388
+ # Resolve device/dtype robustly from model parameters
389
+ try:
390
+ first_param = next(self.transformer.parameters())
391
+ device = first_param.device
392
+ dtype = first_param.dtype
393
+ except StopIteration:
394
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
395
+ dtype = torch.float32
396
+ gamma_tensor = torch.tensor(gammas, device=device, dtype=dtype)
397
+ for name, attn_processor in self.transformer.attn_processors.items():
398
+ if hasattr(attn_processor, 'q_loras'):
399
+ setattr(attn_processor, 'c_factor', gamma_tensor)
400
+ # print(f" Set c_factor {gamma_tensor} on processor {name}")
401
+
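The gamma list passed to `set_gamma_values` is expected to line up one-to-one with the LoRAs of the currently installed (possibly combined) processors; for instance, assuming `pipe` has one or two controls activated via `control_dict["type"]`:

    pipe.set_gamma_values([1.0])        # a single active control LoRA
    pipe.set_gamma_values([1.2, 0.8])   # two combined controls, in the same order as control_dict["type"]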
402
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
403
+ def _get_t5_prompt_embeds(
404
+ self,
405
+ prompt: Union[str, List[str]] = None,
406
+ num_images_per_prompt: int = 1,
407
+ max_sequence_length: int = 512,
408
+ device: Optional[torch.device] = None,
409
+ dtype: Optional[torch.dtype] = None,
410
+ ):
411
+ device = device or self._execution_device
412
+ dtype = dtype or self.text_encoder.dtype
413
+
414
+ prompt = [prompt] if isinstance(prompt, str) else prompt
415
+ batch_size = len(prompt)
416
+
417
+ if isinstance(self, TextualInversionLoaderMixin):
418
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
419
+
420
+ text_inputs = self.tokenizer_2(
421
+ prompt,
422
+ padding="max_length",
423
+ max_length=max_sequence_length,
424
+ truncation=True,
425
+ return_length=False,
426
+ return_overflowing_tokens=False,
427
+ return_tensors="pt",
428
+ )
429
+ text_input_ids = text_inputs.input_ids
430
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
431
+
432
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
433
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
434
+ logger.warning(
435
+ "The following part of your input was truncated because `max_sequence_length` is set to "
436
+ f" {max_sequence_length} tokens: {removed_text}"
437
+ )
438
+
439
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
440
+
441
+ dtype = self.text_encoder_2.dtype
442
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
443
+
444
+ _, seq_len, _ = prompt_embeds.shape
445
+
446
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
447
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
448
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
449
+
450
+ return prompt_embeds
451
+
452
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
453
+ def _get_clip_prompt_embeds(
454
+ self,
455
+ prompt: Union[str, List[str]],
456
+ num_images_per_prompt: int = 1,
457
+ device: Optional[torch.device] = None,
458
+ ):
459
+ device = device or self._execution_device
460
+
461
+ prompt = [prompt] if isinstance(prompt, str) else prompt
462
+ batch_size = len(prompt)
463
+
464
+ if isinstance(self, TextualInversionLoaderMixin):
465
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
466
+
467
+ text_inputs = self.tokenizer(
468
+ prompt,
469
+ padding="max_length",
470
+ max_length=self.tokenizer_max_length,
471
+ truncation=True,
472
+ return_overflowing_tokens=False,
473
+ return_length=False,
474
+ return_tensors="pt",
475
+ )
476
+
477
+ text_input_ids = text_inputs.input_ids
478
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
479
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
480
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
481
+ logger.warning(
482
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
483
+ f" {self.tokenizer_max_length} tokens: {removed_text}"
484
+ )
485
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
486
+
487
+ # Use pooled output of CLIPTextModel
488
+ prompt_embeds = prompt_embeds.pooler_output
489
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
490
+
491
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
492
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
493
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
494
+
495
+ return prompt_embeds
496
+
497
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
498
+ def encode_prompt(
499
+ self,
500
+ prompt: Union[str, List[str]],
501
+ prompt_2: Union[str, List[str]],
502
+ device: Optional[torch.device] = None,
503
+ num_images_per_prompt: int = 1,
504
+ prompt_embeds: Optional[torch.FloatTensor] = None,
505
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
506
+ max_sequence_length: int = 512,
507
+ lora_scale: Optional[float] = None,
508
+ ):
509
+ r"""
510
+
511
+ Args:
512
+ prompt (`str` or `List[str]`, *optional*):
513
+ prompt to be encoded
514
+ prompt_2 (`str` or `List[str]`, *optional*):
515
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
516
+ used in all text-encoders
517
+ device: (`torch.device`):
518
+ torch device
519
+ num_images_per_prompt (`int`):
520
+ number of images that should be generated per prompt
521
+ prompt_embeds (`torch.FloatTensor`, *optional*):
522
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
523
+ provided, text embeddings will be generated from `prompt` input argument.
524
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
525
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
526
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
527
+ lora_scale (`float`, *optional*):
528
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
529
+ """
530
+ device = device or self._execution_device
531
+
532
+ # set lora scale so that monkey patched LoRA
533
+ # function of text encoder can correctly access it
534
+ if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
535
+ self._lora_scale = lora_scale
536
+
537
+ # dynamically adjust the LoRA scale
538
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
539
+ scale_lora_layers(self.text_encoder, lora_scale)
540
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
541
+ scale_lora_layers(self.text_encoder_2, lora_scale)
542
+
543
+ prompt = [prompt] if isinstance(prompt, str) else prompt
544
+
545
+ if prompt_embeds is None:
546
+ prompt_2 = prompt_2 or prompt
547
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
548
+
549
+ # We only use the pooled prompt output from the CLIPTextModel
550
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
551
+ prompt=prompt,
552
+ device=device,
553
+ num_images_per_prompt=num_images_per_prompt,
554
+ )
555
+ prompt_embeds = self._get_t5_prompt_embeds(
556
+ prompt=prompt_2,
557
+ num_images_per_prompt=num_images_per_prompt,
558
+ max_sequence_length=max_sequence_length,
559
+ device=device,
560
+ )
561
+
562
+ if self.text_encoder is not None:
563
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
564
+ # Retrieve the original scale by scaling back the LoRA layers
565
+ unscale_lora_layers(self.text_encoder, lora_scale)
566
+
567
+ if self.text_encoder_2 is not None:
568
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
569
+ # Retrieve the original scale by scaling back the LoRA layers
570
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
571
+
572
+ dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
573
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
574
+
575
+ return prompt_embeds, pooled_prompt_embeds, text_ids
576
+
577
+ # Adapted from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.check_inputs
578
+ def check_inputs(
579
+ self,
580
+ prompt,
581
+ prompt_2,
582
+ height,
583
+ width,
584
+ prompt_embeds=None,
585
+ pooled_prompt_embeds=None,
586
+ callback_on_step_end_tensor_inputs=None,
587
+ max_sequence_length=None,
588
+ ):
589
+ if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
590
+ raise ValueError(
591
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}."
592
+ )
593
+
594
+ if callback_on_step_end_tensor_inputs is not None and not all(
595
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
596
+ ):
597
+ raise ValueError(
598
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
599
+ )
600
+
601
+ if prompt is not None and prompt_embeds is not None:
602
+ raise ValueError(
603
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
604
+ " only forward one of the two."
605
+ )
606
+ elif prompt_2 is not None and prompt_embeds is not None:
607
+ raise ValueError(
608
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
609
+ " only forward one of the two."
610
+ )
611
+ elif prompt is None and prompt_embeds is None:
612
+ raise ValueError(
613
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
614
+ )
615
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
616
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
617
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
618
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
619
+
620
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
621
+ raise ValueError(
622
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
623
+ )
624
+
625
+ if max_sequence_length is not None and max_sequence_length > 512:
626
+ raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
627
+
628
+ @staticmethod
629
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
630
+ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
631
+ latent_image_ids = torch.zeros(height, width, 3)
632
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
633
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
634
+
635
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
636
+
637
+ latent_image_ids = latent_image_ids.reshape(
638
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
639
+ )
640
+
641
+ return latent_image_ids.to(device=device, dtype=dtype)
642
+
643
+ @staticmethod
644
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
645
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
646
+ latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
647
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
648
+ latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
649
+
650
+ return latents
651
+
652
+ @staticmethod
653
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
654
+ def _unpack_latents(latents, height, width, vae_scale_factor):
655
+ batch_size, num_patches, channels = latents.shape
656
+
657
+ # VAE applies 8x compression on images but we must also account for packing which requires
658
+ # latent height and width to be divisible by 2.
659
+ height = 2 * (int(height) // (vae_scale_factor * 2))
660
+ width = 2 * (int(width) // (vae_scale_factor * 2))
661
+
662
+ latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
663
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
664
+
665
+ latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
666
+
667
+ return latents
668
+
669
+ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
670
+ if isinstance(generator, list):
671
+ image_latents = [
672
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
673
+ for i in range(image.shape[0])
674
+ ]
675
+ image_latents = torch.cat(image_latents, dim=0)
676
+ else:
677
+ image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
678
+
679
+ image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
680
+
681
+ return image_latents
682
+
683
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.enable_vae_slicing
684
+ def enable_vae_slicing(self):
685
+ r"""
686
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
687
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
688
+ """
689
+ self.vae.enable_slicing()
690
+
691
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_slicing
692
+ def disable_vae_slicing(self):
693
+ r"""
694
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
695
+ computing decoding in one step.
696
+ """
697
+ self.vae.disable_slicing()
698
+
699
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.enable_vae_tiling
700
+ def enable_vae_tiling(self):
701
+ r"""
702
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
703
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
704
+ processing larger images.
705
+ """
706
+ self.vae.enable_tiling()
707
+
708
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_tiling
709
+ def disable_vae_tiling(self):
710
+ r"""
711
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
712
+ computing decoding in one step.
713
+ """
714
+ self.vae.disable_tiling()
715
+
716
+ def prepare_latents(
717
+ self,
718
+ batch_size,
719
+ num_channels_latents,
720
+ height,
721
+ width,
722
+ dtype,
723
+ device,
724
+ generator,
725
+ image,
726
+ subject_images,
727
+ spatial_images,
728
+ latents=None,
729
+ cond_size=512,
730
+ num_subject_images: int = 0,
731
+ num_spatial_images: int = 0,
732
+ ):
733
+ height = 2 * (int(height) // (self.vae_scale_factor * 2))
734
+ width = 2 * (int(width) // (self.vae_scale_factor * 2))
735
+ height_cond = 2 * (cond_size // (self.vae_scale_factor * 2))
736
+ width_cond = 2 * (cond_size // (self.vae_scale_factor * 2))
737
+
738
+ image_latents = image_ids = None
739
+ image_latent_h = 0 # Initialize to handle case where image is None
740
+
741
+ # Prepare noise latents
742
+ shape = (batch_size, num_channels_latents, height, width)
743
+ if latents is None:
744
+ noise_latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
745
+ else:
746
+ noise_latents = latents.to(device=device, dtype=dtype)
747
+
748
+ noise_latents = self._pack_latents(noise_latents, batch_size, num_channels_latents, height, width)
749
+ # print(noise_latents.shape)
750
+ noise_latent_image_ids, cond_latent_image_ids_resized = resize_position_encoding(
751
+ batch_size, height, width, height_cond, width_cond, device, dtype
752
+ )
753
+ # noise IDs are marked with 0 in the first channel
754
+ noise_latent_image_ids[..., 0] = 0
755
+
756
+ cond_latents_to_concat = []
757
+ latents_ids_to_concat = [noise_latent_image_ids]
758
+
759
+ # 1. Prepare `image` (Kontext) latents
760
+ if image is not None:
761
+ image = image.to(device=device, dtype=dtype)
762
+ if image.shape[1] != self.latent_channels:
763
+ image_latents = self._encode_vae_image(image=image, generator=generator)
764
+ else:
765
+ image_latents = image
766
+
767
+ image_latent_h, image_latent_w = image_latents.shape[2:]
768
+ image_latents = self._pack_latents(
769
+ image_latents, batch_size, num_channels_latents, image_latent_h, image_latent_w
770
+ )
771
+ image_ids = self._prepare_latent_image_ids(
772
+ batch_size, image_latent_h // 2, image_latent_w // 2, device, dtype
773
+ )
774
+ image_ids[..., 0] = 1 # Mark as condition
775
+ latents_ids_to_concat.append(image_ids)
776
+
777
+ # 2. Prepare `subject_images` latents
778
+ if subject_images is not None and num_subject_images > 0:
779
+ subject_images = subject_images.to(device=device, dtype=dtype)
780
+ subject_image_latents = self._encode_vae_image(image=subject_images, generator=generator)
781
+ subject_latent_h, subject_latent_w = subject_image_latents.shape[2:]
782
+ subject_latents = self._pack_latents(
783
+ subject_image_latents, batch_size, num_channels_latents, subject_latent_h, subject_latent_w
784
+ )
785
+
786
+ latent_subject_ids = prepare_latent_subject_ids(height_cond // 2, width_cond // 2, device, dtype)
787
+ latent_subject_ids[..., 0] = 1
788
+ latent_subject_ids[:, 1] += image_latent_h // 2
789
+ subject_latent_image_ids = torch.cat([latent_subject_ids for _ in range(num_subject_images)], dim=0)
790
+
791
+ cond_latents_to_concat.append(subject_latents)
792
+ latents_ids_to_concat.append(subject_latent_image_ids)
793
+
794
+ # 3. Prepare `spatial_images` latents
795
+ if spatial_images is not None and num_spatial_images > 0:
796
+ spatial_images = spatial_images.to(device=device, dtype=dtype)
797
+ spatial_image_latents = self._encode_vae_image(image=spatial_images, generator=generator)
798
+ spatial_latent_h, spatial_latent_w = spatial_image_latents.shape[2:]
799
+ cond_latents = self._pack_latents(
800
+ spatial_image_latents, batch_size, num_channels_latents, spatial_latent_h, spatial_latent_w
801
+ )
802
+ cond_latent_image_ids_resized[..., 0] = 2 # Mark as condition
803
+ cond_latent_image_ids = torch.cat(
804
+ [cond_latent_image_ids_resized for _ in range(num_spatial_images)], dim=0
805
+ )
806
+
807
+ cond_latents_to_concat.append(cond_latents)
808
+ latents_ids_to_concat.append(cond_latent_image_ids)
809
+
810
+ cond_latents = torch.cat(cond_latents_to_concat, dim=1) if cond_latents_to_concat else None
811
+ latent_image_ids = torch.cat(latents_ids_to_concat, dim=0)
812
+
813
+ return noise_latents, image_latents, cond_latents, latent_image_ids
814
+
815
+ @property
816
+ def guidance_scale(self):
817
+ return self._guidance_scale
818
+
819
+ @property
820
+ def joint_attention_kwargs(self):
821
+ return self._joint_attention_kwargs
822
+
823
+ @property
824
+ def num_timesteps(self):
825
+ return self._num_timesteps
826
+
827
+ @property
828
+ def current_timestep(self):
829
+ return self._current_timestep
830
+
831
+ @property
832
+ def interrupt(self):
833
+ return self._interrupt
834
+
835
+ @torch.no_grad()
836
+ def __call__(
837
+ self,
838
+ image: Optional[PipelineImageInput] = None,
839
+ prompt: Union[str, List[str]] = None,
840
+ prompt_2: Optional[Union[str, List[str]]] = None,
841
+ height: Optional[int] = None,
842
+ width: Optional[int] = None,
843
+ num_inference_steps: int = 28,
844
+ sigmas: Optional[List[float]] = None,
845
+ guidance_scale: float = 3.5,
846
+ num_images_per_prompt: Optional[int] = 1,
847
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
848
+ latents: Optional[torch.FloatTensor] = None,
849
+ prompt_embeds: Optional[torch.FloatTensor] = None,
850
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
851
+ output_type: Optional[str] = "pil",
852
+ return_dict: bool = True,
853
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
854
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
855
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
856
+ max_sequence_length: int = 512,
857
+ cond_size: int = 512,
858
+ control_dict: Optional[Dict[str, Any]] = None,
859
+ ):
860
+ r"""
861
+ Function invoked when calling the pipeline for generation.
862
+
863
+ Args:
864
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
865
+ `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
866
+ numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
867
+ or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
868
+ list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
869
+ latents as `image`, but if passing latents directly it is not encoded again.
870
+ prompt (`str` or `List[str]`, *optional*):
871
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
872
+ instead.
873
+ prompt_2 (`str` or `List[str]`, *optional*):
874
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
875
+ will be used instead.
876
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
877
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
878
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
879
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
880
+ num_inference_steps (`int`, *optional*, defaults to 50):
881
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
882
+ expense of slower inference.
883
+ sigmas (`List[float]`, *optional*):
884
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
885
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
886
+ will be used.
887
+ guidance_scale (`float`, *optional*, defaults to 3.5):
888
+ Guidance scale as defined in [Classifier-Free Diffusion
889
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
890
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
891
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
892
+ the text `prompt`, usually at the expense of lower image quality.
893
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
894
+ The number of images to generate per prompt.
895
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
896
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
897
+ to make generation deterministic.
898
+ latents (`torch.FloatTensor`, *optional*):
899
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
900
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
901
+ tensor will ge generated by sampling using the supplied random `generator`.
902
+ prompt_embeds (`torch.FloatTensor`, *optional*):
903
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
904
+ provided, text embeddings will be generated from `prompt` input argument.
905
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
906
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
907
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
908
+ output_type (`str`, *optional*, defaults to `"pil"`):
909
+ The output format of the generate image. Choose between
910
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
911
+ return_dict (`bool`, *optional*, defaults to `True`):
912
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
913
+ joint_attention_kwargs (`dict`, *optional*):
914
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
915
+ `self.processor` in
916
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
917
+ callback_on_step_end (`Callable`, *optional*):
918
+ A function that calls at the end of each denoising steps during the inference. The function is called
919
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
920
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
921
+ `callback_on_step_end_tensor_inputs`.
922
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
923
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
924
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
925
+ `._callback_tensor_inputs` attribute of your pipeline class.
926
+ max_sequence_length (`int` defaults to 512):
927
+ Maximum sequence length to use with the `prompt`.
928
+ cond_size (`int`, *optional*, defaults to 512):
929
+ The size for conditioning images.
930
+
931
+ Examples:
932
+
933
+ Returns:
934
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
935
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
936
+ images.
937
+ """
938
+
939
+ height = height or self.default_sample_size * self.vae_scale_factor
940
+ width = width or self.default_sample_size * self.vae_scale_factor
941
+
942
+ # 1. Check inputs. Raise error if not correct
943
+ self.check_inputs(
944
+ prompt,
945
+ prompt_2,
946
+ height,
947
+ width,
948
+ prompt_embeds=prompt_embeds,
949
+ pooled_prompt_embeds=pooled_prompt_embeds,
950
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
951
+ max_sequence_length=max_sequence_length,
952
+ )
953
+
954
+ self._guidance_scale = guidance_scale
955
+ self._joint_attention_kwargs = joint_attention_kwargs
956
+ self._current_timestep = None
957
+ self._interrupt = False
958
+
959
+ # Normalize control_dict to an empty dict so kontext-only inference works without controls
960
+ control_dict = control_dict or {}
961
+
962
+ spatial_images = control_dict.get("spatial_images", [])
963
+ num_spatial_images = len(spatial_images)
964
+ subject_images = control_dict.get("subject_images", [])
965
+ num_subject_images = len(subject_images)
966
+
967
+ requested_control_type = control_dict.get("type") or None
968
+
969
+ # Normalize to list for unified handling
970
+ if requested_control_type and isinstance(requested_control_type, str):
971
+ requested_control_type = [requested_control_type]
972
+
973
+ # Revert to default if no control type is requested and a control is active
974
+ if not requested_control_type and self.current_control_type:
975
+ print("Reverting to default attention processors.")
976
+ self.transformer.set_attn_processor(FluxAttnProcessor2_0())
977
+ self.current_control_type = None
978
+ # Switch processors only if the control type(s) have changed
979
+ elif requested_control_type != self.current_control_type:
980
+ if requested_control_type:
981
+ print(f"Switching to LoRA control type(s): {requested_control_type}")
982
+ processors = self._combine_control_loras(requested_control_type)
983
+ self.transformer.set_attn_processor(processors)
984
+ # For cond_size, we assume they are compatible and just use the first one.
985
+ self.cond_size = self.control_lora_cond_sizes[requested_control_type[0]]
986
+ self.current_control_type = requested_control_type
987
+
988
+ # Align cond_size to selected control type (if any)
989
+ if hasattr(self, "cond_size"):
990
+ selected_cond_size = self.cond_size
991
+ if isinstance(selected_cond_size, list) and len(selected_cond_size) > 0:
992
+ cond_size = int(selected_cond_size[0])
993
+ elif isinstance(selected_cond_size, int):
994
+ cond_size = selected_cond_size
995
+
996
+ # Set gamma values simply based on provided control_dict['gammas'].
997
+ if requested_control_type:
998
+ raw_gammas = control_dict.get("gammas", [])
999
+ if not isinstance(raw_gammas, list):
1000
+ raw_gammas = [raw_gammas]
1001
+ # flatten one level
1002
+ flattened_gammas: List[float] = []
1003
+ for g in raw_gammas:
1004
+ if isinstance(g, (list, tuple)):
1005
+ flattened_gammas.extend([float(x) for x in g])
1006
+ else:
1007
+ flattened_gammas.append(float(g))
1008
+ if len(flattened_gammas) > 0:
1009
+ self.set_gamma_values(flattened_gammas)
1010
+
1011
+ # 2. Define call parameters
1012
+ if prompt is not None and isinstance(prompt, str):
1013
+ batch_size = 1
1014
+ elif prompt is not None and isinstance(prompt, list):
1015
+ batch_size = len(prompt)
1016
+ else:
1017
+ batch_size = prompt_embeds.shape[0]
1018
+
1019
+ device = self._execution_device
1020
+
1021
+ lora_scale = (
1022
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
1023
+ )
1024
+ (
1025
+ prompt_embeds,
1026
+ pooled_prompt_embeds,
1027
+ text_ids,
1028
+ ) = self.encode_prompt(
1029
+ prompt=prompt,
1030
+ prompt_2=prompt_2,
1031
+ prompt_embeds=prompt_embeds,
1032
+ pooled_prompt_embeds=pooled_prompt_embeds,
1033
+ device=device,
1034
+ num_images_per_prompt=num_images_per_prompt,
1035
+ max_sequence_length=max_sequence_length,
1036
+ lora_scale=lora_scale,
1037
+ )
1038
+
1039
+ # 3. Preprocess images
1040
+ if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
1041
+ img = image[0] if isinstance(image, list) else image
1042
+ image_height, image_width = self.image_processor.get_default_height_width(img)
1043
+ aspect_ratio = image_width / image_height
1044
+            # Kontext is trained on specific resolutions; using one of them is recommended
1045
+ _, image_width, image_height = min(
1046
+ (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
1047
+ )
1048
+ multiple_of = self.vae_scale_factor * 2
1049
+ image_width = image_width // multiple_of * multiple_of
1050
+ image_height = image_height // multiple_of * multiple_of
1051
+ image = self.image_processor.resize(image, image_height, image_width)
1052
+ image = self.image_processor.preprocess(image, image_height, image_width)
1053
+
1054
+ if len(subject_images) > 0:
1055
+ subject_image_ls = []
1056
+ for subject_image in subject_images:
1057
+ w, h = subject_image.size[:2]
1058
+ scale = cond_size / max(h, w)
1059
+ new_h, new_w = int(h * scale), int(w * scale)
1060
+ subject_image = self.image_processor.preprocess(subject_image, height=new_h, width=new_w)
1061
+ subject_image = subject_image.to(dtype=self.vae.dtype)
1062
+ pad_h = cond_size - subject_image.shape[-2]
1063
+ pad_w = cond_size - subject_image.shape[-1]
1064
+ subject_image = pad(
1065
+ subject_image, padding=(int(pad_w / 2), int(pad_h / 2), int(pad_w / 2), int(pad_h / 2)), fill=0
1066
+ )
1067
+ subject_image_ls.append(subject_image)
1068
+ subject_images = torch.cat(subject_image_ls, dim=-2)
1069
+ else:
1070
+ subject_images = None
1071
+
1072
+ if len(spatial_images) > 0:
1073
+ condition_image_ls = []
1074
+ for img in spatial_images:
1075
+ condition_image = self.image_processor.preprocess(img, height=cond_size, width=cond_size)
1076
+ condition_image = condition_image.to(dtype=self.vae.dtype)
1077
+ condition_image_ls.append(condition_image)
1078
+ spatial_images = torch.cat(condition_image_ls, dim=-2)
1079
+ else:
1080
+ spatial_images = None
1081
+
1082
+ # 4. Prepare latent variables
1083
+ num_channels_latents = self.transformer.config.in_channels // 4
1084
+ latents, image_latents, cond_latents, latent_image_ids = self.prepare_latents(
1085
+ batch_size * num_images_per_prompt,
1086
+ num_channels_latents,
1087
+ height,
1088
+ width,
1089
+ prompt_embeds.dtype,
1090
+ device,
1091
+ generator,
1092
+ image,
1093
+ subject_images,
1094
+ spatial_images,
1095
+ latents,
1096
+ cond_size,
1097
+ num_subject_images=num_subject_images,
1098
+ num_spatial_images=num_spatial_images,
1099
+ )
1100
+
1101
+ # 5. Prepare timesteps
1102
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
1103
+ # sigmas = np.array([1.0000, 0.9836, 0.9660, 0.9471, 0.9266, 0.9045, 0.8805, 0.8543, 0.8257, 0.7942, 0.7595, 0.7210, 0.6780, 0.6297, 0.5751, 0.5128, 0.4412, 0.3579, 0.2598, 0.1425])
1104
+ image_seq_len = latents.shape[1]
1105
+ mu = calculate_shift(
1106
+ image_seq_len,
1107
+ self.scheduler.config.get("base_image_seq_len", 256),
1108
+ self.scheduler.config.get("max_image_seq_len", 4096),
1109
+ self.scheduler.config.get("base_shift", 0.5),
1110
+ self.scheduler.config.get("max_shift", 1.15),
1111
+ )
1112
+ timesteps, num_inference_steps = retrieve_timesteps(
1113
+ self.scheduler,
1114
+ num_inference_steps,
1115
+ device,
1116
+ sigmas=sigmas,
1117
+ mu=mu,
1118
+ )
1119
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
1120
+ self._num_timesteps = len(timesteps)
1121
+
1122
+ # handle guidance
1123
+ if self.transformer.config.guidance_embeds:
1124
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
1125
+ guidance = guidance.expand(latents.shape[0])
1126
+ else:
1127
+ guidance = None
1128
+
1129
+ if self.joint_attention_kwargs is None:
1130
+ self._joint_attention_kwargs = {}
1131
+
1132
+ # K/V Caching
1133
+ for name, attn_processor in self.transformer.attn_processors.items():
1134
+ if hasattr(attn_processor, "bank_kv"):
1135
+ attn_processor.bank_kv.clear()
1136
+ if hasattr(attn_processor, "bank_attn"):
1137
+ attn_processor.bank_attn = None
1138
+
1139
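+        # Warm-up pass: run the transformer once at the first timestep with the condition latents so the
+        # control attention processors can refill the K/V banks that were cleared just above.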
+ if cond_latents is not None:
1140
+ latent_model_input = latents
1141
+ if image_latents is not None:
1142
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=1)
1143
+ print(latent_model_input.shape)
1144
+ warmup_latents = latent_model_input
1145
+ warmup_latent_ids = latent_image_ids
1146
+ t = torch.tensor([timesteps[0]], device=device)
1147
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
1148
+ _ = self.transformer(
1149
+ hidden_states=warmup_latents,
1150
+ cond_hidden_states=cond_latents,
1151
+ timestep=timestep / 1000,
1152
+ guidance=guidance,
1153
+ pooled_projections=pooled_prompt_embeds,
1154
+ encoder_hidden_states=prompt_embeds,
1155
+ txt_ids=text_ids,
1156
+ img_ids=warmup_latent_ids,
1157
+ joint_attention_kwargs=self.joint_attention_kwargs,
1158
+ return_dict=False,
1159
+ )[0]
1160
+
1161
+ # 6. Denoising loop
1162
+ self.scheduler.set_begin_index(0)
1163
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1164
+ for i, t in enumerate(timesteps):
1165
+ if self.interrupt:
1166
+ continue
1167
+
1168
+ latent_model_input = latents
1169
+ if image_latents is not None:
1170
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=1)
1171
+
1172
+ self._current_timestep = t
1173
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
1174
+ noise_pred = self.transformer(
1175
+ hidden_states=latent_model_input,
1176
+ cond_hidden_states=cond_latents,
1177
+ timestep=timestep / 1000,
1178
+ guidance=guidance,
1179
+ pooled_projections=pooled_prompt_embeds,
1180
+ encoder_hidden_states=prompt_embeds,
1181
+ txt_ids=text_ids,
1182
+ img_ids=latent_image_ids,
1183
+ joint_attention_kwargs=self.joint_attention_kwargs,
1184
+ return_dict=False,
1185
+ )[0]
1186
+
1187
+ noise_pred = noise_pred[:, : latents.size(1)]
1188
+
1189
+ # compute the previous noisy sample x_t -> x_t-1
1190
+ latents_dtype = latents.dtype
1191
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
1192
+
1193
+ if latents.dtype != latents_dtype:
1194
+ if torch.backends.mps.is_available():
1195
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
1196
+ latents = latents.to(latents_dtype)
1197
+
1198
+ if callback_on_step_end is not None:
1199
+ callback_kwargs = {}
1200
+ for k in callback_on_step_end_tensor_inputs:
1201
+ callback_kwargs[k] = locals()[k]
1202
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1203
+
1204
+ latents = callback_outputs.pop("latents", latents)
1205
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1206
+
1207
+ # call the callback, if provided
1208
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1209
+ progress_bar.update()
1210
+
1211
+ if XLA_AVAILABLE:
1212
+ xm.mark_step()
1213
+
1214
+ self._current_timestep = None
1215
+
1216
+ if output_type == "latent":
1217
+ image = latents
1218
+ else:
1219
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
1220
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
1221
+ image = self.vae.decode(latents, return_dict=False)[0]
1222
+ image = self.image_processor.postprocess(image, output_type=output_type)
1223
+
1224
+ # Offload all models
1225
+ self.maybe_free_model_hooks()
1226
+
1227
+ if not return_dict:
1228
+ return (image,)
1229
+
1230
+ return FluxPipelineOutput(images=image)
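
A minimal usage sketch for the `__call__` above. The pipeline class name, the checkpoint id, the control type key, and the image paths are illustrative assumptions rather than values taken from this commit; `control_dict` follows the keys the code reads (`type`, `spatial_images`, `subject_images`, `gammas`).

import torch
from PIL import Image
from src.pipeline_flux_kontext_control import FluxKontextControlPipeline  # class name assumed

# Load the Kontext base weights (checkpoint id assumed) onto the GPU.
pipe = FluxKontextControlPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16
).to("cuda")

init_image = Image.open("input.png").convert("RGB")  # Kontext reference image
edge_map = Image.open("edge.png").convert("RGB")     # spatial control image

control_dict = {
    "type": ["edge"],              # must match a control LoRA registered on the pipeline (key assumed)
    "spatial_images": [edge_map],  # preprocessed to `cond_size` x `cond_size` internally
    "subject_images": [],
    "gammas": [1.0],               # per-control strength, flattened and passed to set_gamma_values
}

result = pipe(
    prompt="a watercolor painting of a lighthouse at dusk",
    image=init_image,
    control_dict=control_dict,
    num_inference_steps=28,
    guidance_scale=3.5,
    generator=torch.Generator("cuda").manual_seed(0),
)
result.images[0].save("output.png")
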
src/transformer_flux.py ADDED
@@ -0,0 +1,608 @@
1
+ from typing import Any, Dict, Optional, Tuple, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
10
+ from diffusers.models.attention import FeedForward
11
+ from diffusers.models.attention_processor import (
12
+ Attention,
13
+ AttentionProcessor,
14
+ FluxAttnProcessor2_0,
15
+ FluxAttnProcessor2_0_NPU,
16
+ FusedFluxAttnProcessor2_0,
17
+ )
18
+ from diffusers.models.modeling_utils import ModelMixin
19
+ from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
20
+ from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
21
+ from diffusers.utils.import_utils import is_torch_npu_available
22
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
23
+ from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
24
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
25
+
26
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
27
+
28
+ @maybe_allow_in_graph
29
+ class FluxSingleTransformerBlock(nn.Module):
30
+
31
+ def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
32
+ super().__init__()
33
+ self.mlp_hidden_dim = int(dim * mlp_ratio)
34
+
35
+ self.norm = AdaLayerNormZeroSingle(dim)
36
+ self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
37
+ self.act_mlp = nn.GELU(approximate="tanh")
38
+ self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
39
+
40
+ if is_torch_npu_available():
41
+ processor = FluxAttnProcessor2_0_NPU()
42
+ else:
43
+ processor = FluxAttnProcessor2_0()
44
+ self.attn = Attention(
45
+ query_dim=dim,
46
+ cross_attention_dim=None,
47
+ dim_head=attention_head_dim,
48
+ heads=num_attention_heads,
49
+ out_dim=dim,
50
+ bias=True,
51
+ processor=processor,
52
+ qk_norm="rms_norm",
53
+ eps=1e-6,
54
+ pre_only=True,
55
+ )
56
+
57
+ def forward(
58
+ self,
59
+ hidden_states: torch.Tensor,
60
+ cond_hidden_states: torch.Tensor,
61
+ temb: torch.Tensor,
62
+ cond_temb: torch.Tensor,
63
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
64
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
65
+ ) -> torch.Tensor:
66
+ use_cond = cond_hidden_states is not None
67
+
68
+ residual = hidden_states
69
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
70
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
71
+
72
+ if use_cond:
73
+ residual_cond = cond_hidden_states
74
+ norm_cond_hidden_states, cond_gate = self.norm(cond_hidden_states, emb=cond_temb)
75
+ mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_cond_hidden_states))
76
+ norm_hidden_states_concat = torch.concat([norm_hidden_states, norm_cond_hidden_states], dim=-2)
77
+ else:
78
+ norm_hidden_states_concat = norm_hidden_states
79
+
80
+ joint_attention_kwargs = joint_attention_kwargs or {}
81
+ if use_cond:
82
+ attn_output = self.attn(
83
+ hidden_states=norm_hidden_states_concat,
84
+ image_rotary_emb=image_rotary_emb,
85
+ use_cond=use_cond,
86
+ **joint_attention_kwargs,
87
+ )
88
+ else:
89
+ attn_output = self.attn(
90
+ hidden_states=norm_hidden_states_concat,
91
+ image_rotary_emb=image_rotary_emb,
92
+ **joint_attention_kwargs,
93
+ )
94
+ if use_cond:
95
+ attn_output, cond_attn_output = attn_output
96
+
97
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
98
+ gate = gate.unsqueeze(1)
99
+ hidden_states = gate * self.proj_out(hidden_states)
100
+ hidden_states = residual + hidden_states
101
+
102
+ if use_cond:
103
+ condition_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2)
104
+ cond_gate = cond_gate.unsqueeze(1)
105
+ condition_latents = cond_gate * self.proj_out(condition_latents)
106
+ condition_latents = residual_cond + condition_latents
107
+
108
+ if hidden_states.dtype == torch.float16:
109
+ hidden_states = hidden_states.clip(-65504, 65504)
110
+
111
+ return hidden_states, condition_latents if use_cond else None
112
+
113
+
114
+ @maybe_allow_in_graph
115
+ class FluxTransformerBlock(nn.Module):
116
+ def __init__(
117
+ self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
118
+ ):
119
+ super().__init__()
120
+
121
+ self.norm1 = AdaLayerNormZero(dim)
122
+
123
+ self.norm1_context = AdaLayerNormZero(dim)
124
+
125
+ if hasattr(F, "scaled_dot_product_attention"):
126
+ processor = FluxAttnProcessor2_0()
127
+ else:
128
+ raise ValueError(
129
+ "The current PyTorch version does not support the `scaled_dot_product_attention` function."
130
+ )
131
+ self.attn = Attention(
132
+ query_dim=dim,
133
+ cross_attention_dim=None,
134
+ added_kv_proj_dim=dim,
135
+ dim_head=attention_head_dim,
136
+ heads=num_attention_heads,
137
+ out_dim=dim,
138
+ context_pre_only=False,
139
+ bias=True,
140
+ processor=processor,
141
+ qk_norm=qk_norm,
142
+ eps=eps,
143
+ )
144
+
145
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
146
+ self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
147
+
148
+ self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
149
+ self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
150
+
151
+ # let chunk size default to None
152
+ self._chunk_size = None
153
+ self._chunk_dim = 0
154
+
155
+ def forward(
156
+ self,
157
+ hidden_states: torch.Tensor,
158
+ cond_hidden_states: torch.Tensor,
159
+ encoder_hidden_states: torch.Tensor,
160
+ temb: torch.Tensor,
161
+ cond_temb: torch.Tensor,
162
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
163
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
164
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
165
+ use_cond = cond_hidden_states is not None
166
+
167
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
168
+ if use_cond:
169
+ (
170
+ norm_cond_hidden_states,
171
+ cond_gate_msa,
172
+ cond_shift_mlp,
173
+ cond_scale_mlp,
174
+ cond_gate_mlp,
175
+ ) = self.norm1(cond_hidden_states, emb=cond_temb)
176
+ norm_hidden_states = torch.concat([norm_hidden_states, norm_cond_hidden_states], dim=-2)
177
+
178
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
179
+ encoder_hidden_states, emb=temb
180
+ )
181
+ joint_attention_kwargs = joint_attention_kwargs or {}
182
+ # Attention.
183
+ if use_cond:
184
+ attention_outputs = self.attn(
185
+ hidden_states=norm_hidden_states,
186
+ encoder_hidden_states=norm_encoder_hidden_states,
187
+ image_rotary_emb=image_rotary_emb,
188
+ use_cond=use_cond,
189
+ **joint_attention_kwargs,
190
+ )
191
+ else:
192
+ attention_outputs = self.attn(
193
+ hidden_states=norm_hidden_states,
194
+ encoder_hidden_states=norm_encoder_hidden_states,
195
+ image_rotary_emb=image_rotary_emb,
196
+ **joint_attention_kwargs,
197
+ )
198
+
199
+ attn_output, context_attn_output = attention_outputs[:2]
200
+ cond_attn_output = attention_outputs[2] if use_cond else None
201
+
202
+ # Process attention outputs for the `hidden_states`.
203
+ attn_output = gate_msa.unsqueeze(1) * attn_output
204
+ hidden_states = hidden_states + attn_output
205
+
206
+ if use_cond:
207
+ cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output
208
+ cond_hidden_states = cond_hidden_states + cond_attn_output
209
+
210
+ norm_hidden_states = self.norm2(hidden_states)
211
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
212
+
213
+ if use_cond:
214
+ norm_cond_hidden_states = self.norm2(cond_hidden_states)
215
+ norm_cond_hidden_states = (
216
+ norm_cond_hidden_states * (1 + cond_scale_mlp[:, None])
217
+ + cond_shift_mlp[:, None]
218
+ )
219
+
220
+ ff_output = self.ff(norm_hidden_states)
221
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
222
+ hidden_states = hidden_states + ff_output
223
+
224
+ if use_cond:
225
+ cond_ff_output = self.ff(norm_cond_hidden_states)
226
+ cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output
227
+ cond_hidden_states = cond_hidden_states + cond_ff_output
228
+
229
+ # Process attention outputs for the `encoder_hidden_states`.
230
+
231
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
232
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
233
+
234
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
235
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
236
+
237
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
238
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
239
+ if encoder_hidden_states.dtype == torch.float16:
240
+ encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
241
+
242
+ return encoder_hidden_states, hidden_states, cond_hidden_states if use_cond else None
243
+
244
+
245
+ class FluxTransformer2DModel(
246
+ ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, FluxTransformer2DLoadersMixin
247
+ ):
248
+ _supports_gradient_checkpointing = True
249
+ _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
250
+
251
+ @register_to_config
252
+ def __init__(
253
+ self,
254
+ patch_size: int = 1,
255
+ in_channels: int = 64,
256
+ out_channels: Optional[int] = None,
257
+ num_layers: int = 19,
258
+ num_single_layers: int = 38,
259
+ attention_head_dim: int = 128,
260
+ num_attention_heads: int = 24,
261
+ joint_attention_dim: int = 4096,
262
+ pooled_projection_dim: int = 768,
263
+ guidance_embeds: bool = False,
264
+ axes_dims_rope: Tuple[int] = (16, 56, 56),
265
+ ):
266
+ super().__init__()
267
+ self.out_channels = out_channels or in_channels
268
+ self.inner_dim = num_attention_heads * attention_head_dim
269
+
270
+ self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
271
+
272
+ text_time_guidance_cls = (
273
+ CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
274
+ )
275
+ self.time_text_embed = text_time_guidance_cls(
276
+ embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
277
+ )
278
+
279
+ self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim)
280
+ self.x_embedder = nn.Linear(in_channels, self.inner_dim)
281
+
282
+ self.transformer_blocks = nn.ModuleList(
283
+ [
284
+ FluxTransformerBlock(
285
+ dim=self.inner_dim,
286
+ num_attention_heads=num_attention_heads,
287
+ attention_head_dim=attention_head_dim,
288
+ )
289
+ for _ in range(num_layers)
290
+ ]
291
+ )
292
+
293
+ self.single_transformer_blocks = nn.ModuleList(
294
+ [
295
+ FluxSingleTransformerBlock(
296
+ dim=self.inner_dim,
297
+ num_attention_heads=num_attention_heads,
298
+ attention_head_dim=attention_head_dim,
299
+ )
300
+ for _ in range(num_single_layers)
301
+ ]
302
+ )
303
+
304
+ self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
305
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
306
+
307
+ self.gradient_checkpointing = False
308
+
309
+ @property
310
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
311
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
312
+ r"""
313
+ Returns:
314
+            `dict` of attention processors: A dictionary containing all attention processors used in the model,
315
+ indexed by its weight name.
316
+ """
317
+ # set recursively
318
+ processors = {}
319
+
320
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
321
+ if hasattr(module, "get_processor"):
322
+ processors[f"{name}.processor"] = module.get_processor()
323
+
324
+ for sub_name, child in module.named_children():
325
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
326
+
327
+ return processors
328
+
329
+ for name, module in self.named_children():
330
+ fn_recursive_add_processors(name, module, processors)
331
+
332
+ return processors
333
+
334
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
335
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
336
+ r"""
337
+ Sets the attention processor to use to compute attention.
338
+
339
+ Parameters:
340
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
341
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
342
+ for **all** `Attention` layers.
343
+
344
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
345
+ processor. This is strongly recommended when setting trainable attention processors.
346
+
347
+ """
348
+ count = len(self.attn_processors.keys())
349
+
350
+ if isinstance(processor, dict) and len(processor) != count:
351
+ raise ValueError(
352
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
353
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
354
+ )
355
+
356
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
357
+ if hasattr(module, "set_processor"):
358
+ if not isinstance(processor, dict):
359
+ module.set_processor(processor)
360
+ else:
361
+ module.set_processor(processor.pop(f"{name}.processor"))
362
+
363
+ for sub_name, child in module.named_children():
364
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
365
+
366
+ # Make a copy of the processor dictionary to avoid destructive changes to the original.
367
+ if isinstance(processor, dict):
368
+ processor = processor.copy()
369
+
370
+ for name, module in self.named_children():
371
+ fn_recursive_attn_processor(name, module, processor)
372
+
373
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
374
+ def fuse_qkv_projections(self):
375
+ """
376
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
377
+ are fused. For cross-attention modules, key and value projection matrices are fused.
378
+
379
+ <Tip warning={true}>
380
+
381
+ This API is 🧪 experimental.
382
+
383
+ </Tip>
384
+ """
385
+ self.original_attn_processors = None
386
+
387
+ for _, attn_processor in self.attn_processors.items():
388
+ if "Added" in str(attn_processor.__class__.__name__):
389
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
390
+
391
+ self.original_attn_processors = self.attn_processors
392
+
393
+ for module in self.modules():
394
+ if isinstance(module, Attention):
395
+ module.fuse_projections(fuse=True)
396
+
397
+ self.set_attn_processor(FusedFluxAttnProcessor2_0())
398
+
399
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
400
+ def unfuse_qkv_projections(self):
401
+ """Disables the fused QKV projection if enabled.
402
+
403
+ <Tip warning={true}>
404
+
405
+ This API is 🧪 experimental.
406
+
407
+ </Tip>
408
+
409
+ """
410
+ if self.original_attn_processors is not None:
411
+ self.set_attn_processor(self.original_attn_processors)
412
+
413
+ def _set_gradient_checkpointing(self, module, value=False):
414
+ if hasattr(module, "gradient_checkpointing"):
415
+ module.gradient_checkpointing = value
416
+
417
+ def forward(
418
+ self,
419
+ hidden_states: torch.Tensor,
420
+ cond_hidden_states: torch.Tensor = None,
421
+ encoder_hidden_states: torch.Tensor = None,
422
+ pooled_projections: torch.Tensor = None,
423
+ timestep: torch.LongTensor = None,
424
+ img_ids: torch.Tensor = None,
425
+ txt_ids: torch.Tensor = None,
426
+ guidance: torch.Tensor = None,
427
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
428
+ controlnet_block_samples=None,
429
+ controlnet_single_block_samples=None,
430
+ return_dict: bool = True,
431
+ controlnet_blocks_repeat: bool = False,
432
+ ) -> Union[torch.Tensor, Transformer2DModelOutput]:
433
+ if cond_hidden_states is not None:
434
+ use_condition = True
435
+ else:
436
+ use_condition = False
437
+
438
+ if joint_attention_kwargs is not None:
439
+ joint_attention_kwargs = joint_attention_kwargs.copy()
440
+ lora_scale = joint_attention_kwargs.pop("scale", 1.0)
441
+ else:
442
+ lora_scale = 1.0
443
+
444
+ if USE_PEFT_BACKEND:
445
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
446
+ scale_lora_layers(self, lora_scale)
447
+ else:
448
+ if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
449
+ logger.warning(
450
+ "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
451
+ )
452
+ hidden_states = self.x_embedder(hidden_states)
453
+ if cond_hidden_states is not None:
454
+ if cond_hidden_states.shape[-1] == self.x_embedder.in_features:
455
+ cond_hidden_states = self.x_embedder(cond_hidden_states)
456
+ elif cond_hidden_states.shape[-1] == 64:
457
+                # Use only the first 64 input columns of the weight matrix (plus the bias)
458
+ weight = self.x_embedder.weight[:, :64] # [inner_dim, 64]
459
+ bias = self.x_embedder.bias
460
+ cond_hidden_states = torch.nn.functional.linear(cond_hidden_states, weight, bias)
461
+
462
+ timestep = timestep.to(hidden_states.dtype) * 1000
463
+ if guidance is not None:
464
+ guidance = guidance.to(hidden_states.dtype) * 1000
465
+ else:
466
+ guidance = None
467
+
468
+ temb = (
469
+ self.time_text_embed(timestep, pooled_projections)
470
+ if guidance is None
471
+ else self.time_text_embed(timestep, guidance, pooled_projections)
472
+ )
473
+
474
+ cond_temb = (
475
+ self.time_text_embed(torch.ones_like(timestep) * 0, pooled_projections)
476
+ if guidance is None
477
+ else self.time_text_embed(
478
+ torch.ones_like(timestep) * 0, guidance, pooled_projections
479
+ )
480
+ )
481
+
482
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
483
+
484
+
485
+ if txt_ids.ndim == 3:
486
+ logger.warning(
487
+ "Passing `txt_ids` 3d torch.Tensor is deprecated."
488
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
489
+ )
490
+ txt_ids = txt_ids[0]
491
+ if img_ids.ndim == 3:
492
+ logger.warning(
493
+ "Passing `img_ids` 3d torch.Tensor is deprecated."
494
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
495
+ )
496
+ img_ids = img_ids[0]
497
+
498
+ ids = torch.cat((txt_ids, img_ids), dim=0)
499
+ image_rotary_emb = self.pos_embed(ids)
500
+
501
+ if joint_attention_kwargs is not None and "ip_adapter_image_embeds" in joint_attention_kwargs:
502
+ ip_adapter_image_embeds = joint_attention_kwargs.pop("ip_adapter_image_embeds")
503
+ ip_hidden_states = self.encoder_hid_proj(ip_adapter_image_embeds)
504
+ joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states})
505
+
506
+ for index_block, block in enumerate(self.transformer_blocks):
507
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
508
+
509
+ def create_custom_forward(module, return_dict=None):
510
+ def custom_forward(*inputs):
511
+ if return_dict is not None:
512
+ return module(*inputs, return_dict=return_dict)
513
+ else:
514
+ return module(*inputs)
515
+
516
+ return custom_forward
517
+
518
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
519
+                encoder_hidden_states, hidden_states, cond_hidden_states = torch.utils.checkpoint.checkpoint(
520
+                    create_custom_forward(block),
521
+                    hidden_states,
522
+                    cond_hidden_states if use_condition else None,
523
+                    encoder_hidden_states,
524
+                    temb,
525
+                    cond_temb if use_condition else None,
526
+                    image_rotary_emb,
527
+                    **ckpt_kwargs,
528
+                )
529
+
530
+ else:
531
+ encoder_hidden_states, hidden_states, cond_hidden_states = block(
532
+ hidden_states=hidden_states,
533
+ encoder_hidden_states=encoder_hidden_states,
534
+ cond_hidden_states=cond_hidden_states if use_condition else None,
535
+ temb=temb,
536
+ cond_temb=cond_temb if use_condition else None,
537
+ image_rotary_emb=image_rotary_emb,
538
+ joint_attention_kwargs=joint_attention_kwargs,
539
+ )
540
+
541
+ # controlnet residual
542
+ if controlnet_block_samples is not None:
543
+ interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
544
+ interval_control = int(np.ceil(interval_control))
545
+ # For Xlabs ControlNet.
546
+ if controlnet_blocks_repeat:
547
+ hidden_states = (
548
+ hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
549
+ )
550
+ else:
551
+ hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
552
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
553
+
554
+ for index_block, block in enumerate(self.single_transformer_blocks):
555
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
556
+
557
+ def create_custom_forward(module, return_dict=None):
558
+ def custom_forward(*inputs):
559
+ if return_dict is not None:
560
+ return module(*inputs, return_dict=return_dict)
561
+ else:
562
+ return module(*inputs)
563
+
564
+ return custom_forward
565
+
566
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
567
+                hidden_states, cond_hidden_states = torch.utils.checkpoint.checkpoint(
568
+                    create_custom_forward(block),
569
+                    hidden_states,
570
+                    cond_hidden_states if use_condition else None,
571
+                    temb,
572
+                    cond_temb if use_condition else None,
573
+                    image_rotary_emb,
574
+                    **ckpt_kwargs,
575
+                )
576
+
577
+ else:
578
+ hidden_states, cond_hidden_states = block(
579
+ hidden_states=hidden_states,
580
+ cond_hidden_states=cond_hidden_states if use_condition else None,
581
+ temb=temb,
582
+ cond_temb=cond_temb if use_condition else None,
583
+ image_rotary_emb=image_rotary_emb,
584
+ joint_attention_kwargs=joint_attention_kwargs,
585
+ )
586
+
587
+ # controlnet residual
588
+ if controlnet_single_block_samples is not None:
589
+ interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
590
+ interval_control = int(np.ceil(interval_control))
591
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
592
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...]
593
+ + controlnet_single_block_samples[index_block // interval_control]
594
+ )
595
+
596
+ hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
597
+
598
+ hidden_states = self.norm_out(hidden_states, temb)
599
+ output = self.proj_out(hidden_states)
600
+
601
+ if USE_PEFT_BACKEND:
602
+ # remove `lora_scale` from each PEFT layer
603
+ unscale_lora_layers(self, lora_scale)
604
+
605
+ if not return_dict:
606
+ return (output,)
607
+
608
+ return Transformer2DModelOutput(sample=output)
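
A small smoke-test sketch of the modified transformer, run without the condition branch; the tiny config values are arbitrary and only illustrate the I/O contract (packed image tokens in, a per-token prediction out). Passing `cond_hidden_states` additionally requires attention processors that accept the `use_cond` keyword and return a separate condition output (the K/V-caching processors this repo installs via `set_attn_processor`); the stock `FluxAttnProcessor2_0` does not.

import torch
from src.transformer_flux import FluxTransformer2DModel  # assumes the src package is importable

model = FluxTransformer2DModel(
    patch_size=1,
    in_channels=16,
    num_layers=1,
    num_single_layers=1,
    attention_head_dim=8,
    num_attention_heads=2,       # inner_dim = 2 * 8 = 16
    joint_attention_dim=32,
    pooled_projection_dim=16,
    axes_dims_rope=(4, 2, 2),    # entries must be even and sum to attention_head_dim
)

img_seq, txt_seq = 64, 8
out = model(
    hidden_states=torch.randn(1, img_seq, 16),          # packed image latents
    encoder_hidden_states=torch.randn(1, txt_seq, 32),  # text encoder features
    pooled_projections=torch.randn(1, 16),              # pooled text embedding
    timestep=torch.tensor([1.0]),
    img_ids=torch.zeros(img_seq, 3),
    txt_ids=torch.zeros(txt_seq, 3),
)
print(out.sample.shape)  # torch.Size([1, 64, 16]) -- prediction for the image tokens only
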
train/default_config.yaml ADDED
@@ -0,0 +1,16 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ main_process_port: 14121
5
+ downcast_bf16: 'no'
6
+ gpu_ids: all
7
+ machine_rank: 0
8
+ main_training_function: main
9
+ mixed_precision: fp16
10
+ num_machines: 1
11
+ num_processes: 8
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
train/src/__init__.py ADDED
File without changes
train/src/condition/edge_extraction.py ADDED
@@ -0,0 +1,356 @@
1
+ import warnings
2
+ import cv2
3
+ import numpy as np
4
+ from PIL import Image
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+ import os
9
+
10
+ from einops import rearrange
11
+
12
+ from .util import HWC3, nms, safe_step, resize_image_with_pad, common_input_validate, get_intensity_mask, combine_layers
13
+
14
+ from .pidi import pidinet
15
+ from .ted import TED
16
+ from .lineart import Generator as LineartGenerator
17
+ from .informative_drawing import Generator
18
+ from .hed import ControlNetHED_Apache2
19
+
20
+ from pathlib import Path
21
+
22
+ from skimage import morphology
23
+ import argparse
24
+ from tqdm import tqdm
25
+
26
+
27
+ PREPROCESSORS_ROOT = os.getenv("PREPROCESSORS_ROOT", os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))), "models/preprocessors"))
28
+
29
+
30
+ class HEDDetector:
31
+ def __init__(self, netNetwork):
32
+ self.netNetwork = netNetwork
33
+ self.device = "cpu"
34
+
35
+ @classmethod
36
+ def from_pretrained(cls, filename="ControlNetHED.pth"):
37
+ model_path = os.path.join(PREPROCESSORS_ROOT, filename)
38
+
39
+ netNetwork = ControlNetHED_Apache2()
40
+ netNetwork.load_state_dict(torch.load(model_path, map_location='cpu'))
41
+ netNetwork.float().eval()
42
+
43
+ return cls(netNetwork)
44
+
45
+ def to(self, device):
46
+ self.netNetwork.to(device)
47
+ self.device = device
48
+ return self
49
+
50
+
51
+ def __call__(self, input_image, detect_resolution=512, safe=False, output_type=None, scribble=True, upscale_method="INTER_CUBIC", **kwargs):
52
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
53
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
54
+
55
+ assert input_image.ndim == 3
56
+ H, W, C = input_image.shape
57
+ with torch.no_grad():
58
+ image_hed = torch.from_numpy(input_image).float().to(self.device)
59
+ image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
60
+ edges = self.netNetwork(image_hed)
61
+ edges = [e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges]
62
+ edges = [cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR) for e in edges]
63
+ edges = np.stack(edges, axis=2)
64
+ edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64)))
65
+ if safe:
66
+ edge = safe_step(edge)
67
+ edge = (edge * 255.0).clip(0, 255).astype(np.uint8)
68
+
69
+ detected_map = edge
70
+
71
+ if scribble:
72
+ detected_map = nms(detected_map, 127, 3.0)
73
+ detected_map = cv2.GaussianBlur(detected_map, (0, 0), 3.0)
74
+ detected_map[detected_map > 4] = 255
75
+ detected_map[detected_map < 255] = 0
76
+
77
+ detected_map = HWC3(remove_pad(detected_map))
78
+
79
+ if output_type == "pil":
80
+ detected_map = Image.fromarray(detected_map)
81
+
82
+ return detected_map
83
+
84
+
85
+ class CannyDetector:
86
+ def __call__(self, input_image=None, low_threshold=100, high_threshold=200, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
87
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
88
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
89
+ detected_map = cv2.Canny(detected_map, low_threshold, high_threshold)
90
+ detected_map = HWC3(remove_pad(detected_map))
91
+
92
+ if output_type == "pil":
93
+ detected_map = Image.fromarray(detected_map)
94
+
95
+ return detected_map
96
+
97
+ class PidiNetDetector:
98
+ def __init__(self, netNetwork):
99
+ self.netNetwork = netNetwork
100
+ self.device = "cpu"
101
+
102
+ @classmethod
103
+ def from_pretrained(cls, filename="table5_pidinet.pth"):
104
+ model_path = os.path.join(PREPROCESSORS_ROOT, filename)
105
+
106
+ netNetwork = pidinet()
107
+ netNetwork.load_state_dict({k.replace('module.', ''): v for k, v in torch.load(model_path)['state_dict'].items()})
108
+ netNetwork.eval()
109
+
110
+ return cls(netNetwork)
111
+
112
+ def to(self, device):
113
+ self.netNetwork.to(device)
114
+ self.device = device
115
+ return self
116
+
117
+ def __call__(self, input_image, detect_resolution=512, safe=False, output_type=None, scribble=True, apply_filter=False, upscale_method="INTER_CUBIC", **kwargs):
118
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
119
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
120
+
121
+ detected_map = detected_map[:, :, ::-1].copy()
122
+ with torch.no_grad():
123
+ image_pidi = torch.from_numpy(detected_map).float().to(self.device)
124
+ image_pidi = image_pidi / 255.0
125
+ image_pidi = rearrange(image_pidi, 'h w c -> 1 c h w')
126
+ edge = self.netNetwork(image_pidi)[-1]
127
+ edge = edge.cpu().numpy()
128
+ if apply_filter:
129
+ edge = edge > 0.5
130
+ if safe:
131
+ edge = safe_step(edge)
132
+ edge = (edge * 255.0).clip(0, 255).astype(np.uint8)
133
+
134
+ detected_map = edge[0, 0]
135
+
136
+ if scribble:
137
+ detected_map = nms(detected_map, 127, 3.0)
138
+ detected_map = cv2.GaussianBlur(detected_map, (0, 0), 3.0)
139
+ detected_map[detected_map > 4] = 255
140
+ detected_map[detected_map < 255] = 0
141
+
142
+ detected_map = HWC3(remove_pad(detected_map))
143
+
144
+ if output_type == "pil":
145
+ detected_map = Image.fromarray(detected_map)
146
+
147
+ return detected_map
148
+
149
+ class TEDDetector:
150
+ def __init__(self, model):
151
+ self.model = model
152
+ self.device = "cpu"
153
+
154
+ @classmethod
155
+ def from_pretrained(cls, filename="7_model.pth"):
156
+ model_path = os.path.join(PREPROCESSORS_ROOT, filename)
157
+ model = TED()
158
+ model.load_state_dict(torch.load(model_path, map_location="cpu"))
159
+ model.eval()
160
+ return cls(model)
161
+
162
+ def to(self, device):
163
+ self.model.to(device)
164
+ self.device = device
165
+ return self
166
+
167
+ def __call__(self, input_image, detect_resolution=512, safe_steps=2, upscale_method="INTER_CUBIC", output_type=None, **kwargs):
168
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
169
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
170
+
171
+ H, W, _ = input_image.shape
172
+ with torch.no_grad():
173
+ image_teed = torch.from_numpy(input_image.copy()).float().to(self.device)
174
+ image_teed = rearrange(image_teed, 'h w c -> 1 c h w')
175
+ edges = self.model(image_teed)
176
+ edges = [e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges]
177
+ edges = [cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR) for e in edges]
178
+ edges = np.stack(edges, axis=2)
179
+ edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64)))
180
+ if safe_steps != 0:
181
+ edge = safe_step(edge, safe_steps)
182
+ edge = (edge * 255.0).clip(0, 255).astype(np.uint8)
183
+
184
+ detected_map = remove_pad(HWC3(edge))
185
+ if output_type == "pil":
186
+ detected_map = Image.fromarray(detected_map[..., :3])
187
+
188
+ return detected_map
189
+
190
+ class LineartStandardDetector:
191
+ def __call__(self, input_image=None, guassian_sigma=6.0, intensity_threshold=8, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
192
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
193
+ input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
194
+
195
+ x = input_image.astype(np.float32)
196
+ g = cv2.GaussianBlur(x, (0, 0), guassian_sigma)
197
+ intensity = np.min(g - x, axis=2).clip(0, 255)
198
+ intensity /= max(16, np.median(intensity[intensity > intensity_threshold]))
199
+ intensity *= 127
200
+ detected_map = intensity.clip(0, 255).astype(np.uint8)
201
+
202
+ detected_map = HWC3(remove_pad(detected_map))
203
+ if output_type == "pil":
204
+ detected_map = Image.fromarray(detected_map)
205
+ return detected_map
206
+
207
+ class AnyLinePreprocessor:
208
+ def __init__(self, mteed_model, lineart_standard_detector):
209
+ self.device = "cpu"
210
+ self.mteed_model = mteed_model
211
+ self.lineart_standard_detector = lineart_standard_detector
212
+
213
+ @classmethod
214
+ def from_pretrained(cls, mteed_filename="MTEED.pth"):
215
+ mteed_model = TEDDetector.from_pretrained(filename=mteed_filename)
216
+ lineart_standard_detector = LineartStandardDetector()
217
+ return cls(mteed_model, lineart_standard_detector)
218
+
219
+ def to(self, device):
220
+ self.mteed_model.to(device)
221
+ self.device = device
222
+ return self
223
+
224
+ def __call__(self, image, resolution=512, lineart_lower_bound=0, lineart_upper_bound=1, object_min_size=36, object_connectivity=1):
225
+ # Process the image with MTEED model
226
+ mteed_result = self.mteed_model(image, detect_resolution=resolution)
227
+
228
+ # Process the image with the lineart standard preprocessor
229
+ lineart_result = self.lineart_standard_detector(image, guassian_sigma=2, intensity_threshold=3, resolution=resolution)
230
+
231
+ _lineart_result = get_intensity_mask(lineart_result, lower_bound=lineart_lower_bound, upper_bound=lineart_upper_bound)
232
+ _cleaned = morphology.remove_small_objects(_lineart_result.astype(bool), min_size=object_min_size, connectivity=object_connectivity)
233
+ _lineart_result = _lineart_result * _cleaned
234
+ _mteed_result = mteed_result
235
+
236
+ result = combine_layers(_mteed_result, _lineart_result)
237
+ # print(result.shape)
238
+ return result
239
+
240
+ class LineartDetector:
241
+ def __init__(self, model, coarse_model):
242
+ self.model = model
243
+ self.model_coarse = coarse_model
244
+ self.device = "cpu"
245
+
246
+ @classmethod
247
+ def from_pretrained(cls, filename="sk_model.pth", coarse_filename="sk_model2.pth"):
248
+ model_path = os.path.join(PREPROCESSORS_ROOT, filename)
249
+ coarse_model_path = os.path.join(PREPROCESSORS_ROOT, coarse_filename)
250
+
251
+ model = LineartGenerator(3, 1, 3)
252
+ model.load_state_dict(torch.load(model_path, map_location="cpu"))
253
+ model.eval()
254
+
255
+ coarse_model = LineartGenerator(3, 1, 3)
256
+ coarse_model.load_state_dict(torch.load(coarse_model_path, map_location="cpu"))
257
+ coarse_model.eval()
258
+
259
+ return cls(model, coarse_model)
260
+
261
+ def to(self, device):
262
+ self.model.to(device)
263
+ self.model_coarse.to(device)
264
+ self.device = device
265
+ return self
266
+
267
+ def __call__(self, input_image, coarse=False, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
268
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
269
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
270
+
271
+ model = self.model_coarse if coarse else self.model
272
+ assert detected_map.ndim == 3
273
+ with torch.no_grad():
274
+ image = torch.from_numpy(detected_map).float().to(self.device)
275
+ image = image / 255.0
276
+ image = rearrange(image, 'h w c -> 1 c h w')
277
+ line = model(image)[0][0]
278
+
279
+ line = line.cpu().numpy()
280
+ line = (line * 255.0).clip(0, 255).astype(np.uint8)
281
+
282
+ detected_map = HWC3(line)
283
+ detected_map = remove_pad(255 - detected_map)
284
+
285
+ if output_type == "pil":
286
+ detected_map = Image.fromarray(detected_map)
287
+
288
+ return detected_map
289
+
290
+
291
+ class InformativeDetector:
292
+ def __init__(self, anime_model, contour_model):
293
+ self.anime_model = anime_model
294
+ self.contour_model = contour_model
295
+ self.device = "cpu"
296
+
297
+ @classmethod
298
+ def from_pretrained(cls, anime_filename="anime_style.pth", contour_filename="contour_style.pth"):
299
+ anime_model_path = os.path.join(PREPROCESSORS_ROOT, anime_filename)
300
+ contour_model_path = os.path.join(PREPROCESSORS_ROOT, contour_filename)
301
+
302
+        # Create the two Generator models (anime style and contour style)
303
+ anime_model = Generator(3, 1, 3) # input_nc=3, output_nc=1, n_blocks=3
304
+ anime_model.load_state_dict(torch.load(anime_model_path, map_location="cpu"))
305
+ anime_model.eval()
306
+
307
+ contour_model = Generator(3, 1, 3) # input_nc=3, output_nc=1, n_blocks=3
308
+ contour_model.load_state_dict(torch.load(contour_model_path, map_location="cpu"))
309
+ contour_model.eval()
310
+
311
+ return cls(anime_model, contour_model)
312
+
313
+ def to(self, device):
314
+ self.anime_model.to(device)
315
+ self.contour_model.to(device)
316
+ self.device = device
317
+ return self
318
+
319
+ def __call__(self, input_image, style="anime", detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
320
+ """
321
+        Extract a sketch from the input image.
322
+
323
+        Args:
324
+            input_image: input image
325
+            style: "anime" or "contour"
326
+            detect_resolution: detection resolution
327
+            output_type: output type
328
+            upscale_method: upsampling method
329
+ """
330
+ input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
331
+ detected_map, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
332
+
333
+        # Select the model for the requested style
334
+ model = self.anime_model if style == "anime" else self.contour_model
335
+
336
+ assert detected_map.ndim == 3
337
+ with torch.no_grad():
338
+ image = torch.from_numpy(detected_map).float().to(self.device)
339
+ image = image / 255.0
340
+            # Rearrange dimensions (h, w, c) -> (1, c, h, w)
341
+ image = image.permute(2, 0, 1).unsqueeze(0)
342
+
343
+            # Generate the sketch
344
+ sketch = model(image)
345
+            sketch = sketch[0][0]  # take the first channel of the first batch element
346
+
347
+ sketch = sketch.cpu().numpy()
348
+ sketch = (sketch * 255.0).clip(0, 255).astype(np.uint8)
349
+
350
+ detected_map = HWC3(sketch)
351
+        detected_map = remove_pad(255 - detected_map)  # invert the colors
352
+
353
+ if output_type == "pil":
354
+ detected_map = Image.fromarray(detected_map)
355
+
356
+ return detected_map
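
A usage sketch for the detectors above. Checkpoint filenames follow the defaults of each `from_pretrained()` and are expected under `PREPROCESSORS_ROOT` (overridable via the environment variable of the same name); the input path, the CUDA device, and the import path are assumptions.

import numpy as np
from PIL import Image
from train.src.condition.edge_extraction import AnyLinePreprocessor, CannyDetector, HEDDetector

image = np.array(Image.open("photo.png").convert("RGB"))

# Canny needs no weights; the thresholds control edge sensitivity.
canny_map = CannyDetector()(image, low_threshold=100, high_threshold=200, output_type="pil")

# HED loads ControlNetHED.pth and can post-process soft edges into a scribble map.
hed = HEDDetector.from_pretrained().to("cuda")
scribble = hed(image, detect_resolution=512, scribble=True, output_type="pil")

# AnyLine fuses the MTEED edge map with the cleaned-up standard lineart map.
anyline = AnyLinePreprocessor.from_pretrained().to("cuda")
line_map = anyline(image, resolution=512)  # numpy array
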
train/src/condition/hed.py ADDED
@@ -0,0 +1,56 @@
1
+ # This is an improved version and model of HED edge detection with Apache License, Version 2.0.
2
+ # Please use this implementation in your products
3
+ # This implementation may produce slightly different results from Saining Xie's official implementations,
4
+ # but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations.
5
+ # Different from official models and other implementations, this is an RGB-input model (rather than BGR)
6
+ # and in this way it works better for gradio's RGB protocol
7
+
8
+ import os
9
+ import warnings
10
+
11
+ import cv2
12
+ import numpy as np
13
+ import torch
14
+ from einops import rearrange
15
+ from PIL import Image
16
+
17
+ from .util import HWC3, nms, resize_image_with_pad, safe_step, common_input_validate
18
+
19
+
20
+ class DoubleConvBlock(torch.nn.Module):
21
+ def __init__(self, input_channel, output_channel, layer_number):
22
+ super().__init__()
23
+ self.convs = torch.nn.Sequential()
24
+ self.convs.append(torch.nn.Conv2d(in_channels=input_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
25
+ for i in range(1, layer_number):
26
+ self.convs.append(torch.nn.Conv2d(in_channels=output_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
27
+ self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1, kernel_size=(1, 1), stride=(1, 1), padding=0)
28
+
29
+ def __call__(self, x, down_sampling=False):
30
+ h = x
31
+ if down_sampling:
32
+ h = torch.nn.functional.max_pool2d(h, kernel_size=(2, 2), stride=(2, 2))
33
+ for conv in self.convs:
34
+ h = conv(h)
35
+ h = torch.nn.functional.relu(h)
36
+ return h, self.projection(h)
37
+
38
+
39
+ class ControlNetHED_Apache2(torch.nn.Module):
40
+ def __init__(self):
41
+ super().__init__()
42
+ self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1)))
43
+ self.block1 = DoubleConvBlock(input_channel=3, output_channel=64, layer_number=2)
44
+ self.block2 = DoubleConvBlock(input_channel=64, output_channel=128, layer_number=2)
45
+ self.block3 = DoubleConvBlock(input_channel=128, output_channel=256, layer_number=3)
46
+ self.block4 = DoubleConvBlock(input_channel=256, output_channel=512, layer_number=3)
47
+ self.block5 = DoubleConvBlock(input_channel=512, output_channel=512, layer_number=3)
48
+
49
+ def __call__(self, x):
50
+ h = x - self.norm
51
+ h, projection1 = self.block1(h)
52
+ h, projection2 = self.block2(h, down_sampling=True)
53
+ h, projection3 = self.block3(h, down_sampling=True)
54
+ h, projection4 = self.block4(h, down_sampling=True)
55
+ h, projection5 = self.block5(h, down_sampling=True)
56
+ return projection1, projection2, projection3, projection4, projection5
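
A quick shape check (a sketch) of the five side outputs: `block1` keeps the input resolution and every later block halves it with max-pooling before its convolutions. `HEDDetector` in edge_extraction.py resizes these maps back to the input size, averages them, and passes the mean through a sigmoid.

import torch
from train.src.condition.hed import ControlNetHED_Apache2  # assumes the train/src packages are importable

net = ControlNetHED_Apache2().eval()
x = torch.zeros(1, 3, 512, 512)  # raw RGB pixels in [0, 255]; HEDDetector feeds them unnormalized
with torch.no_grad():
    p1, p2, p3, p4, p5 = net(x)
print([tuple(p.shape[-2:]) for p in (p1, p2, p3, p4, p5)])
# [(512, 512), (256, 256), (128, 128), (64, 64), (32, 32)] -- each projection is single-channel
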
train/src/condition/informative_drawing.py ADDED
@@ -0,0 +1,279 @@
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+ import torch
4
+ import functools
5
+ from torchvision import models
6
+ from torch.autograd import Variable
7
+ import numpy as np
8
+ import math
9
+
10
+ norm_layer = nn.InstanceNorm2d
11
+
12
+ class ResidualBlock(nn.Module):
13
+ def __init__(self, in_features):
14
+ super(ResidualBlock, self).__init__()
15
+
16
+ conv_block = [ nn.ReflectionPad2d(1),
17
+ nn.Conv2d(in_features, in_features, 3),
18
+ norm_layer(in_features),
19
+ nn.ReLU(inplace=True),
20
+ nn.ReflectionPad2d(1),
21
+ nn.Conv2d(in_features, in_features, 3),
22
+ norm_layer(in_features)
23
+ ]
24
+
25
+ self.conv_block = nn.Sequential(*conv_block)
26
+
27
+ def forward(self, x):
28
+ return x + self.conv_block(x)
29
+
30
+
31
+ class Generator(nn.Module):
32
+ def __init__(self, input_nc, output_nc, n_residual_blocks=9, sigmoid=True):
33
+ super(Generator, self).__init__()
34
+
35
+ # Initial convolution block
36
+ model0 = [ nn.ReflectionPad2d(3),
37
+ nn.Conv2d(input_nc, 64, 7),
38
+ norm_layer(64),
39
+ nn.ReLU(inplace=True) ]
40
+ self.model0 = nn.Sequential(*model0)
41
+
42
+ # Downsampling
43
+ model1 = []
44
+ in_features = 64
45
+ out_features = in_features*2
46
+ for _ in range(2):
47
+ model1 += [ nn.Conv2d(in_features, out_features, 3, stride=2, padding=1),
48
+ norm_layer(out_features),
49
+ nn.ReLU(inplace=True) ]
50
+ in_features = out_features
51
+ out_features = in_features*2
52
+ self.model1 = nn.Sequential(*model1)
53
+
54
+ model2 = []
55
+ # Residual blocks
56
+ for _ in range(n_residual_blocks):
57
+ model2 += [ResidualBlock(in_features)]
58
+ self.model2 = nn.Sequential(*model2)
59
+
60
+ # Upsampling
61
+ model3 = []
62
+ out_features = in_features//2
63
+ for _ in range(2):
64
+ model3 += [ nn.ConvTranspose2d(in_features, out_features, 3, stride=2, padding=1, output_padding=1),
65
+ norm_layer(out_features),
66
+ nn.ReLU(inplace=True) ]
67
+ in_features = out_features
68
+ out_features = in_features//2
69
+ self.model3 = nn.Sequential(*model3)
70
+
71
+ # Output layer
72
+ model4 = [ nn.ReflectionPad2d(3),
73
+ nn.Conv2d(64, output_nc, 7)]
74
+ if sigmoid:
75
+ model4 += [nn.Sigmoid()]
76
+
77
+ self.model4 = nn.Sequential(*model4)
78
+
79
+ def forward(self, x, cond=None):
80
+ out = self.model0(x)
81
+ out = self.model1(out)
82
+ out = self.model2(out)
83
+ out = self.model3(out)
84
+ out = self.model4(out)
85
+
86
+ return out
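+
+ # Usage sketch (assuming a 3-channel input, a 1-channel line-drawing output and a
+ # 256x256 image; n_residual_blocks keeps its default of 9):
+ #   net = Generator(input_nc=3, output_nc=1)
+ #   lines = net(torch.rand(1, 3, 256, 256))  # -> (1, 1, 256, 256), sigmoid output in [0, 1]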
87
+
88
+ # Define a resnet block
89
+ class ResnetBlock(nn.Module):
90
+ def __init__(self, dim, padding_type, norm_layer, activation=nn.ReLU(True), use_dropout=False):
91
+ super(ResnetBlock, self).__init__()
92
+ self.conv_block = self.build_conv_block(dim, padding_type, norm_layer, activation, use_dropout)
93
+
94
+ def build_conv_block(self, dim, padding_type, norm_layer, activation, use_dropout):
95
+ conv_block = []
96
+ p = 0
97
+ if padding_type == 'reflect':
98
+ conv_block += [nn.ReflectionPad2d(1)]
99
+ elif padding_type == 'replicate':
100
+ conv_block += [nn.ReplicationPad2d(1)]
101
+ elif padding_type == 'zero':
102
+ p = 1
103
+ else:
104
+ raise NotImplementedError('padding [%s] is not implemented' % padding_type)
105
+
106
+ conv_block += [nn.Conv2d(dim, dim, kernel_size=3, padding=p),
107
+ norm_layer(dim),
108
+ activation]
109
+ if use_dropout:
110
+ conv_block += [nn.Dropout(0.5)]
111
+
112
+ p = 0
113
+ if padding_type == 'reflect':
114
+ conv_block += [nn.ReflectionPad2d(1)]
115
+ elif padding_type == 'replicate':
116
+ conv_block += [nn.ReplicationPad2d(1)]
117
+ elif padding_type == 'zero':
118
+ p = 1
119
+ else:
120
+ raise NotImplementedError('padding [%s] is not implemented' % padding_type)
121
+ conv_block += [nn.Conv2d(dim, dim, kernel_size=3, padding=p),
122
+ norm_layer(dim)]
123
+
124
+ return nn.Sequential(*conv_block)
125
+
126
+ def forward(self, x):
127
+ out = x + self.conv_block(x)
128
+ return out
129
+
130
+ class GlobalGenerator2(nn.Module):
131
+ def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d,
132
+ padding_type='reflect', use_sig=False, n_UPsampling=0):
133
+ assert(n_blocks >= 0)
134
+ super(GlobalGenerator2, self).__init__()
135
+ activation = nn.ReLU(True)
136
+
137
+ mult = 8
138
+ model = [nn.ReflectionPad2d(4), nn.Conv2d(input_nc, ngf*mult, kernel_size=7, padding=0), norm_layer(ngf*mult), activation]
139
+
140
+ ### upsample spatially while halving channels (labelled "downsample" in the original code)
141
+ for i in range(n_downsampling):
142
+ model += [nn.ConvTranspose2d(ngf * mult, ngf * mult // 2, kernel_size=4, stride=2, padding=1),
143
+ norm_layer(ngf * mult // 2), activation]
144
+ mult = mult // 2
145
+
146
+ if n_UPsampling <= 0:
147
+ n_UPsampling = n_downsampling
148
+
149
+ ### resnet blocks
150
+ for i in range(n_blocks):
151
+ model += [ResnetBlock(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer)]
152
+
153
+ ### upsample
154
+ for i in range(n_UPsampling):
155
+ next_mult = mult // 2
156
+ if next_mult == 0:
157
+ next_mult = 1
158
+ mult = 1
159
+
160
+ model += [nn.ConvTranspose2d(ngf * mult, int(ngf * next_mult), kernel_size=3, stride=2, padding=1, output_padding=1),
161
+ norm_layer(int(ngf * next_mult)), activation]
162
+ mult = next_mult
163
+
164
+ if use_sig:
165
+ model += [nn.ReflectionPad2d(3), nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0), nn.Sigmoid()]
166
+ else:
167
+ model += [nn.ReflectionPad2d(3), nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0), nn.Tanh()]
168
+ self.model = nn.Sequential(*model)
169
+
170
+ def forward(self, input, cond=None):
171
+ return self.model(input)
172
+
173
+
174
+ class InceptionV3(nn.Module): #avg pool
175
+ def __init__(self, num_classes, isTrain, use_aux=True, pretrain=False, freeze=True, every_feat=False):
176
+ super(InceptionV3, self).__init__()
177
+ """ Inception v3 expects (299,299) sized images for training and has auxiliary output
178
+ """
179
+
180
+ self.every_feat = every_feat
181
+
182
+ self.model_ft = models.inception_v3(pretrained=pretrain)
183
+ stop = 0
184
+ if freeze and pretrain:
185
+ for child in self.model_ft.children():
186
+ if stop < 17:
187
+ for param in child.parameters():
188
+ param.requires_grad = False
189
+ stop += 1
190
+
191
+ num_ftrs = self.model_ft.AuxLogits.fc.in_features #768
192
+ self.model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
193
+
194
+ # Handle the primary net
195
+ num_ftrs = self.model_ft.fc.in_features #2048
196
+ self.model_ft.fc = nn.Linear(num_ftrs,num_classes)
197
+
198
+ self.model_ft.input_size = 299
199
+
200
+ self.isTrain = isTrain
201
+ self.use_aux = use_aux
202
+
203
+ if self.isTrain:
204
+ self.model_ft.train()
205
+ else:
206
+ self.model_ft.eval()
207
+
208
+
209
+ def forward(self, x, cond=None, catch_gates=False):
210
+ # N x 3 x 299 x 299
211
+ x = self.model_ft.Conv2d_1a_3x3(x)
212
+
213
+ # N x 32 x 149 x 149
214
+ x = self.model_ft.Conv2d_2a_3x3(x)
215
+ # N x 32 x 147 x 147
216
+ x = self.model_ft.Conv2d_2b_3x3(x)
217
+ # N x 64 x 147 x 147
218
+ x = F.max_pool2d(x, kernel_size=3, stride=2)
219
+ # N x 64 x 73 x 73
220
+ x = self.model_ft.Conv2d_3b_1x1(x)
221
+ # N x 80 x 73 x 73
222
+ x = self.model_ft.Conv2d_4a_3x3(x)
223
+
224
+ # N x 192 x 71 x 71
225
+ x = F.max_pool2d(x, kernel_size=3, stride=2)
226
+ # N x 192 x 35 x 35
227
+ x = self.model_ft.Mixed_5b(x)
228
+ feat1 = x
229
+ # N x 256 x 35 x 35
230
+ x = self.model_ft.Mixed_5c(x)
231
+ feat11 = x
232
+ # N x 288 x 35 x 35
233
+ x = self.model_ft.Mixed_5d(x)
234
+ feat12 = x
235
+ # N x 288 x 35 x 35
236
+ x = self.model_ft.Mixed_6a(x)
237
+ feat2 = x
238
+ # N x 768 x 17 x 17
239
+ x = self.model_ft.Mixed_6b(x)
240
+ feat21 = x
241
+ # N x 768 x 17 x 17
242
+ x = self.model_ft.Mixed_6c(x)
243
+ feat22 = x
244
+ # N x 768 x 17 x 17
245
+ x = self.model_ft.Mixed_6d(x)
246
+ feat23 = x
247
+ # N x 768 x 17 x 17
248
+ x = self.model_ft.Mixed_6e(x)
249
+
250
+ feat3 = x
251
+
252
+ # N x 768 x 17 x 17
253
+ aux_defined = self.isTrain and self.use_aux
254
+ if aux_defined:
255
+ aux = self.model_ft.AuxLogits(x)
256
+ else:
257
+ aux = None
258
+ # N x 768 x 17 x 17
259
+ x = self.model_ft.Mixed_7a(x)
260
+ # N x 1280 x 8 x 8
261
+ x = self.model_ft.Mixed_7b(x)
262
+ # N x 2048 x 8 x 8
263
+ x = self.model_ft.Mixed_7c(x)
264
+ # N x 2048 x 8 x 8
265
+ # Adaptive average pooling
266
+ x = F.adaptive_avg_pool2d(x, (1, 1))
267
+ # N x 2048 x 1 x 1
268
+ feats = F.dropout(x, training=self.isTrain)
269
+ # N x 2048 x 1 x 1
270
+ x = torch.flatten(feats, 1)
271
+ # N x 2048
272
+ x = self.model_ft.fc(x)
273
+ # N x 1000 (num_classes)
274
+
275
+ if self.every_feat:
276
+ # return feat21, feats, x
277
+ return x, feat21
278
+
279
+ return x, aux
train/src/condition/lineart.py ADDED
@@ -0,0 +1,86 @@
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+ import torch
4
+ import functools
5
+ from torchvision import models
6
+ from torch.autograd import Variable
7
+ import numpy as np
8
+ import math
9
+
10
+ norm_layer = nn.InstanceNorm2d
11
+
12
+ class ResidualBlock(nn.Module):
13
+ def __init__(self, in_features):
14
+ super(ResidualBlock, self).__init__()
15
+
16
+ conv_block = [ nn.ReflectionPad2d(1),
17
+ nn.Conv2d(in_features, in_features, 3),
18
+ norm_layer(in_features),
19
+ nn.ReLU(inplace=True),
20
+ nn.ReflectionPad2d(1),
21
+ nn.Conv2d(in_features, in_features, 3),
22
+ norm_layer(in_features)
23
+ ]
24
+
25
+ self.conv_block = nn.Sequential(*conv_block)
26
+
27
+ def forward(self, x):
28
+ return x + self.conv_block(x)
29
+
30
+
31
+ class Generator(nn.Module):
32
+ def __init__(self, input_nc, output_nc, n_residual_blocks=9, sigmoid=True):
33
+ super(Generator, self).__init__()
34
+
35
+ # Initial convolution block
36
+ model0 = [ nn.ReflectionPad2d(3),
37
+ nn.Conv2d(input_nc, 64, 7),
38
+ norm_layer(64),
39
+ nn.ReLU(inplace=True) ]
40
+ self.model0 = nn.Sequential(*model0)
41
+
42
+ # Downsampling
43
+ model1 = []
44
+ in_features = 64
45
+ out_features = in_features*2
46
+ for _ in range(2):
47
+ model1 += [ nn.Conv2d(in_features, out_features, 3, stride=2, padding=1),
48
+ norm_layer(out_features),
49
+ nn.ReLU(inplace=True) ]
50
+ in_features = out_features
51
+ out_features = in_features*2
52
+ self.model1 = nn.Sequential(*model1)
53
+
54
+ model2 = []
55
+ # Residual blocks
56
+ for _ in range(n_residual_blocks):
57
+ model2 += [ResidualBlock(in_features)]
58
+ self.model2 = nn.Sequential(*model2)
59
+
60
+ # Upsampling
61
+ model3 = []
62
+ out_features = in_features//2
63
+ for _ in range(2):
64
+ model3 += [ nn.ConvTranspose2d(in_features, out_features, 3, stride=2, padding=1, output_padding=1),
65
+ norm_layer(out_features),
66
+ nn.ReLU(inplace=True) ]
67
+ in_features = out_features
68
+ out_features = in_features//2
69
+ self.model3 = nn.Sequential(*model3)
70
+
71
+ # Output layer
72
+ model4 = [ nn.ReflectionPad2d(3),
73
+ nn.Conv2d(64, output_nc, 7)]
74
+ if sigmoid:
75
+ model4 += [nn.Sigmoid()]
76
+
77
+ self.model4 = nn.Sequential(*model4)
78
+
79
+ def forward(self, x, cond=None):
80
+ out = self.model0(x)
81
+ out = self.model1(out)
82
+ out = self.model2(out)
83
+ out = self.model3(out)
84
+ out = self.model4(out)
85
+
86
+ return out
train/src/condition/pidi.py ADDED
@@ -0,0 +1,681 @@
1
+ """
2
+ Author: Zhuo Su, Wenzhe Liu
3
+ Date: Feb 18, 2021
4
+ """
5
+
6
+ import math
7
+
8
+ import cv2
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+
15
+ def img2tensor(imgs, bgr2rgb=True, float32=True):
16
+ """Numpy array to tensor.
17
+
18
+ Args:
19
+ imgs (list[ndarray] | ndarray): Input images.
20
+ bgr2rgb (bool): Whether to change bgr to rgb.
21
+ float32 (bool): Whether to change to float32.
22
+
23
+ Returns:
24
+ list[tensor] | tensor: Tensor images. If returned results only have
25
+ one element, just return tensor.
26
+ """
27
+
28
+ def _totensor(img, bgr2rgb, float32):
29
+ if img.shape[2] == 3 and bgr2rgb:
30
+ if img.dtype == 'float64':
31
+ img = img.astype('float32')
32
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
33
+ img = torch.from_numpy(img.transpose(2, 0, 1))
34
+ if float32:
35
+ img = img.float()
36
+ return img
37
+
38
+ if isinstance(imgs, list):
39
+ return [_totensor(img, bgr2rgb, float32) for img in imgs]
40
+ else:
41
+ return _totensor(imgs, bgr2rgb, float32)
42
+
43
+ nets = {
44
+ 'baseline': {
45
+ 'layer0': 'cv',
46
+ 'layer1': 'cv',
47
+ 'layer2': 'cv',
48
+ 'layer3': 'cv',
49
+ 'layer4': 'cv',
50
+ 'layer5': 'cv',
51
+ 'layer6': 'cv',
52
+ 'layer7': 'cv',
53
+ 'layer8': 'cv',
54
+ 'layer9': 'cv',
55
+ 'layer10': 'cv',
56
+ 'layer11': 'cv',
57
+ 'layer12': 'cv',
58
+ 'layer13': 'cv',
59
+ 'layer14': 'cv',
60
+ 'layer15': 'cv',
61
+ },
62
+ 'c-v15': {
63
+ 'layer0': 'cd',
64
+ 'layer1': 'cv',
65
+ 'layer2': 'cv',
66
+ 'layer3': 'cv',
67
+ 'layer4': 'cv',
68
+ 'layer5': 'cv',
69
+ 'layer6': 'cv',
70
+ 'layer7': 'cv',
71
+ 'layer8': 'cv',
72
+ 'layer9': 'cv',
73
+ 'layer10': 'cv',
74
+ 'layer11': 'cv',
75
+ 'layer12': 'cv',
76
+ 'layer13': 'cv',
77
+ 'layer14': 'cv',
78
+ 'layer15': 'cv',
79
+ },
80
+ 'a-v15': {
81
+ 'layer0': 'ad',
82
+ 'layer1': 'cv',
83
+ 'layer2': 'cv',
84
+ 'layer3': 'cv',
85
+ 'layer4': 'cv',
86
+ 'layer5': 'cv',
87
+ 'layer6': 'cv',
88
+ 'layer7': 'cv',
89
+ 'layer8': 'cv',
90
+ 'layer9': 'cv',
91
+ 'layer10': 'cv',
92
+ 'layer11': 'cv',
93
+ 'layer12': 'cv',
94
+ 'layer13': 'cv',
95
+ 'layer14': 'cv',
96
+ 'layer15': 'cv',
97
+ },
98
+ 'r-v15': {
99
+ 'layer0': 'rd',
100
+ 'layer1': 'cv',
101
+ 'layer2': 'cv',
102
+ 'layer3': 'cv',
103
+ 'layer4': 'cv',
104
+ 'layer5': 'cv',
105
+ 'layer6': 'cv',
106
+ 'layer7': 'cv',
107
+ 'layer8': 'cv',
108
+ 'layer9': 'cv',
109
+ 'layer10': 'cv',
110
+ 'layer11': 'cv',
111
+ 'layer12': 'cv',
112
+ 'layer13': 'cv',
113
+ 'layer14': 'cv',
114
+ 'layer15': 'cv',
115
+ },
116
+ 'cvvv4': {
117
+ 'layer0': 'cd',
118
+ 'layer1': 'cv',
119
+ 'layer2': 'cv',
120
+ 'layer3': 'cv',
121
+ 'layer4': 'cd',
122
+ 'layer5': 'cv',
123
+ 'layer6': 'cv',
124
+ 'layer7': 'cv',
125
+ 'layer8': 'cd',
126
+ 'layer9': 'cv',
127
+ 'layer10': 'cv',
128
+ 'layer11': 'cv',
129
+ 'layer12': 'cd',
130
+ 'layer13': 'cv',
131
+ 'layer14': 'cv',
132
+ 'layer15': 'cv',
133
+ },
134
+ 'avvv4': {
135
+ 'layer0': 'ad',
136
+ 'layer1': 'cv',
137
+ 'layer2': 'cv',
138
+ 'layer3': 'cv',
139
+ 'layer4': 'ad',
140
+ 'layer5': 'cv',
141
+ 'layer6': 'cv',
142
+ 'layer7': 'cv',
143
+ 'layer8': 'ad',
144
+ 'layer9': 'cv',
145
+ 'layer10': 'cv',
146
+ 'layer11': 'cv',
147
+ 'layer12': 'ad',
148
+ 'layer13': 'cv',
149
+ 'layer14': 'cv',
150
+ 'layer15': 'cv',
151
+ },
152
+ 'rvvv4': {
153
+ 'layer0': 'rd',
154
+ 'layer1': 'cv',
155
+ 'layer2': 'cv',
156
+ 'layer3': 'cv',
157
+ 'layer4': 'rd',
158
+ 'layer5': 'cv',
159
+ 'layer6': 'cv',
160
+ 'layer7': 'cv',
161
+ 'layer8': 'rd',
162
+ 'layer9': 'cv',
163
+ 'layer10': 'cv',
164
+ 'layer11': 'cv',
165
+ 'layer12': 'rd',
166
+ 'layer13': 'cv',
167
+ 'layer14': 'cv',
168
+ 'layer15': 'cv',
169
+ },
170
+ 'cccv4': {
171
+ 'layer0': 'cd',
172
+ 'layer1': 'cd',
173
+ 'layer2': 'cd',
174
+ 'layer3': 'cv',
175
+ 'layer4': 'cd',
176
+ 'layer5': 'cd',
177
+ 'layer6': 'cd',
178
+ 'layer7': 'cv',
179
+ 'layer8': 'cd',
180
+ 'layer9': 'cd',
181
+ 'layer10': 'cd',
182
+ 'layer11': 'cv',
183
+ 'layer12': 'cd',
184
+ 'layer13': 'cd',
185
+ 'layer14': 'cd',
186
+ 'layer15': 'cv',
187
+ },
188
+ 'aaav4': {
189
+ 'layer0': 'ad',
190
+ 'layer1': 'ad',
191
+ 'layer2': 'ad',
192
+ 'layer3': 'cv',
193
+ 'layer4': 'ad',
194
+ 'layer5': 'ad',
195
+ 'layer6': 'ad',
196
+ 'layer7': 'cv',
197
+ 'layer8': 'ad',
198
+ 'layer9': 'ad',
199
+ 'layer10': 'ad',
200
+ 'layer11': 'cv',
201
+ 'layer12': 'ad',
202
+ 'layer13': 'ad',
203
+ 'layer14': 'ad',
204
+ 'layer15': 'cv',
205
+ },
206
+ 'rrrv4': {
207
+ 'layer0': 'rd',
208
+ 'layer1': 'rd',
209
+ 'layer2': 'rd',
210
+ 'layer3': 'cv',
211
+ 'layer4': 'rd',
212
+ 'layer5': 'rd',
213
+ 'layer6': 'rd',
214
+ 'layer7': 'cv',
215
+ 'layer8': 'rd',
216
+ 'layer9': 'rd',
217
+ 'layer10': 'rd',
218
+ 'layer11': 'cv',
219
+ 'layer12': 'rd',
220
+ 'layer13': 'rd',
221
+ 'layer14': 'rd',
222
+ 'layer15': 'cv',
223
+ },
224
+ 'c16': {
225
+ 'layer0': 'cd',
226
+ 'layer1': 'cd',
227
+ 'layer2': 'cd',
228
+ 'layer3': 'cd',
229
+ 'layer4': 'cd',
230
+ 'layer5': 'cd',
231
+ 'layer6': 'cd',
232
+ 'layer7': 'cd',
233
+ 'layer8': 'cd',
234
+ 'layer9': 'cd',
235
+ 'layer10': 'cd',
236
+ 'layer11': 'cd',
237
+ 'layer12': 'cd',
238
+ 'layer13': 'cd',
239
+ 'layer14': 'cd',
240
+ 'layer15': 'cd',
241
+ },
242
+ 'a16': {
243
+ 'layer0': 'ad',
244
+ 'layer1': 'ad',
245
+ 'layer2': 'ad',
246
+ 'layer3': 'ad',
247
+ 'layer4': 'ad',
248
+ 'layer5': 'ad',
249
+ 'layer6': 'ad',
250
+ 'layer7': 'ad',
251
+ 'layer8': 'ad',
252
+ 'layer9': 'ad',
253
+ 'layer10': 'ad',
254
+ 'layer11': 'ad',
255
+ 'layer12': 'ad',
256
+ 'layer13': 'ad',
257
+ 'layer14': 'ad',
258
+ 'layer15': 'ad',
259
+ },
260
+ 'r16': {
261
+ 'layer0': 'rd',
262
+ 'layer1': 'rd',
263
+ 'layer2': 'rd',
264
+ 'layer3': 'rd',
265
+ 'layer4': 'rd',
266
+ 'layer5': 'rd',
267
+ 'layer6': 'rd',
268
+ 'layer7': 'rd',
269
+ 'layer8': 'rd',
270
+ 'layer9': 'rd',
271
+ 'layer10': 'rd',
272
+ 'layer11': 'rd',
273
+ 'layer12': 'rd',
274
+ 'layer13': 'rd',
275
+ 'layer14': 'rd',
276
+ 'layer15': 'rd',
277
+ },
278
+ 'carv4': {
279
+ 'layer0': 'cd',
280
+ 'layer1': 'ad',
281
+ 'layer2': 'rd',
282
+ 'layer3': 'cv',
283
+ 'layer4': 'cd',
284
+ 'layer5': 'ad',
285
+ 'layer6': 'rd',
286
+ 'layer7': 'cv',
287
+ 'layer8': 'cd',
288
+ 'layer9': 'ad',
289
+ 'layer10': 'rd',
290
+ 'layer11': 'cv',
291
+ 'layer12': 'cd',
292
+ 'layer13': 'ad',
293
+ 'layer14': 'rd',
294
+ 'layer15': 'cv',
295
+ },
296
+ }
297
+
298
+ def createConvFunc(op_type):
299
+ assert op_type in ['cv', 'cd', 'ad', 'rd'], 'unknown op type: %s' % str(op_type)
300
+ if op_type == 'cv':
301
+ return F.conv2d
302
+
303
+ if op_type == 'cd':
304
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
305
+ assert dilation in [1, 2], 'dilation for cd_conv should be in 1 or 2'
306
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for cd_conv should be 3x3'
307
+ assert padding == dilation, 'padding for cd_conv set wrong'
308
+
309
+ weights_c = weights.sum(dim=[2, 3], keepdim=True)
310
+ yc = F.conv2d(x, weights_c, stride=stride, padding=0, groups=groups)
311
+ y = F.conv2d(x, weights, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
312
+ return y - yc
313
+ return func
314
+ elif op_type == 'ad':
315
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
316
+ assert dilation in [1, 2], 'dilation for ad_conv should be in 1 or 2'
317
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for ad_conv should be 3x3'
318
+ assert padding == dilation, 'padding for ad_conv set wrong'
319
+
320
+ shape = weights.shape
321
+ weights = weights.view(shape[0], shape[1], -1)
322
+ weights_conv = (weights - weights[:, :, [3, 0, 1, 6, 4, 2, 7, 8, 5]]).view(shape) # clock-wise
323
+ y = F.conv2d(x, weights_conv, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
324
+ return y
325
+ return func
326
+ elif op_type == 'rd':
327
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
328
+ assert dilation in [1, 2], 'dilation for rd_conv should be in 1 or 2'
329
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for rd_conv should be 3x3'
330
+ padding = 2 * dilation
331
+
332
+ shape = weights.shape
333
+ if weights.is_cuda:
334
+ buffer = torch.cuda.FloatTensor(shape[0], shape[1], 5 * 5).fill_(0)
335
+ else:
336
+ buffer = torch.zeros(shape[0], shape[1], 5 * 5).to(weights.device)
337
+ weights = weights.view(shape[0], shape[1], -1)
338
+ buffer[:, :, [0, 2, 4, 10, 14, 20, 22, 24]] = weights[:, :, 1:]
339
+ buffer[:, :, [6, 7, 8, 11, 13, 16, 17, 18]] = -weights[:, :, 1:]
340
+ buffer[:, :, 12] = 0
341
+ buffer = buffer.view(shape[0], shape[1], 5, 5)
342
+ y = F.conv2d(x, buffer, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
343
+ return y
344
+ return func
345
+ else:
346
+ print('impossible to be here unless you force that')
347
+ return None
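+
+ # In short: 'cv' is a vanilla F.conv2d; 'cd' subtracts the response of the kernel-sum
+ # (a 1x1 conv), so a constant patch maps to a (nearly) zero response; 'ad' convolves with
+ # the kernel minus a rotated copy of itself; 'rd' scatters the eight outer 3x3 weights
+ # onto a 5x5 ring with their negatives on the inner ring. Every returned function keeps
+ # the F.conv2d signature expected by the Conv2d wrapper below.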
348
+
349
+ class Conv2d(nn.Module):
350
+ def __init__(self, pdc, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False):
351
+ super(Conv2d, self).__init__()
352
+ if in_channels % groups != 0:
353
+ raise ValueError('in_channels must be divisible by groups')
354
+ if out_channels % groups != 0:
355
+ raise ValueError('out_channels must be divisible by groups')
356
+ self.in_channels = in_channels
357
+ self.out_channels = out_channels
358
+ self.kernel_size = kernel_size
359
+ self.stride = stride
360
+ self.padding = padding
361
+ self.dilation = dilation
362
+ self.groups = groups
363
+ self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size))
364
+ if bias:
365
+ self.bias = nn.Parameter(torch.Tensor(out_channels))
366
+ else:
367
+ self.register_parameter('bias', None)
368
+ self.reset_parameters()
369
+ self.pdc = pdc
370
+
371
+ def reset_parameters(self):
372
+ nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
373
+ if self.bias is not None:
374
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
375
+ bound = 1 / math.sqrt(fan_in)
376
+ nn.init.uniform_(self.bias, -bound, bound)
377
+
378
+ def forward(self, input):
379
+
380
+ return self.pdc(input, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
381
+
382
+ class CSAM(nn.Module):
383
+ """
384
+ Compact Spatial Attention Module
385
+ """
386
+ def __init__(self, channels):
387
+ super(CSAM, self).__init__()
388
+
389
+ mid_channels = 4
390
+ self.relu1 = nn.ReLU()
391
+ self.conv1 = nn.Conv2d(channels, mid_channels, kernel_size=1, padding=0)
392
+ self.conv2 = nn.Conv2d(mid_channels, 1, kernel_size=3, padding=1, bias=False)
393
+ self.sigmoid = nn.Sigmoid()
394
+ nn.init.constant_(self.conv1.bias, 0)
395
+
396
+ def forward(self, x):
397
+ y = self.relu1(x)
398
+ y = self.conv1(y)
399
+ y = self.conv2(y)
400
+ y = self.sigmoid(y)
401
+
402
+ return x * y
403
+
404
+ class CDCM(nn.Module):
405
+ """
406
+ Compact Dilation Convolution based Module
407
+ """
408
+ def __init__(self, in_channels, out_channels):
409
+ super(CDCM, self).__init__()
410
+
411
+ self.relu1 = nn.ReLU()
412
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)
413
+ self.conv2_1 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=5, padding=5, bias=False)
414
+ self.conv2_2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=7, padding=7, bias=False)
415
+ self.conv2_3 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=9, padding=9, bias=False)
416
+ self.conv2_4 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=11, padding=11, bias=False)
417
+ nn.init.constant_(self.conv1.bias, 0)
418
+
419
+ def forward(self, x):
420
+ x = self.relu1(x)
421
+ x = self.conv1(x)
422
+ x1 = self.conv2_1(x)
423
+ x2 = self.conv2_2(x)
424
+ x3 = self.conv2_3(x)
425
+ x4 = self.conv2_4(x)
426
+ return x1 + x2 + x3 + x4
427
+
428
+
429
+ class MapReduce(nn.Module):
430
+ """
431
+ Reduce feature maps into a single edge map
432
+ """
433
+ def __init__(self, channels):
434
+ super(MapReduce, self).__init__()
435
+ self.conv = nn.Conv2d(channels, 1, kernel_size=1, padding=0)
436
+ nn.init.constant_(self.conv.bias, 0)
437
+
438
+ def forward(self, x):
439
+ return self.conv(x)
440
+
441
+
442
+ class PDCBlock(nn.Module):
443
+ def __init__(self, pdc, inplane, ouplane, stride=1):
444
+ super(PDCBlock, self).__init__()
445
+ self.stride=stride
446
+
448
+ if self.stride > 1:
449
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
450
+ self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0)
451
+ self.conv1 = Conv2d(pdc, inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False)
452
+ self.relu2 = nn.ReLU()
453
+ self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False)
454
+
455
+ def forward(self, x):
456
+ if self.stride > 1:
457
+ x = self.pool(x)
458
+ y = self.conv1(x)
459
+ y = self.relu2(y)
460
+ y = self.conv2(y)
461
+ if self.stride > 1:
462
+ x = self.shortcut(x)
463
+ y = y + x
464
+ return y
465
+
466
+ class PDCBlock_converted(nn.Module):
467
+ """
468
+ CPDC, APDC can be converted to vanilla 3x3 convolution
469
+ RPDC can be converted to vanilla 5x5 convolution
470
+ """
471
+ def __init__(self, pdc, inplane, ouplane, stride=1):
472
+ super(PDCBlock_converted, self).__init__()
473
+ self.stride=stride
474
+
475
+ if self.stride > 1:
476
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
477
+ self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0)
478
+ if pdc == 'rd':
479
+ self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=5, padding=2, groups=inplane, bias=False)
480
+ else:
481
+ self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False)
482
+ self.relu2 = nn.ReLU()
483
+ self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False)
484
+
485
+ def forward(self, x):
486
+ if self.stride > 1:
487
+ x = self.pool(x)
488
+ y = self.conv1(x)
489
+ y = self.relu2(y)
490
+ y = self.conv2(y)
491
+ if self.stride > 1:
492
+ x = self.shortcut(x)
493
+ y = y + x
494
+ return y
495
+
496
+ class PiDiNet(nn.Module):
497
+ def __init__(self, inplane, pdcs, dil=None, sa=False, convert=False):
498
+ super(PiDiNet, self).__init__()
499
+ self.sa = sa
500
+ if dil is not None:
501
+ assert isinstance(dil, int), 'dil should be an int'
502
+ self.dil = dil
503
+
504
+ self.fuseplanes = []
505
+
506
+ self.inplane = inplane
507
+ if convert:
508
+ if pdcs[0] == 'rd':
509
+ init_kernel_size = 5
510
+ init_padding = 2
511
+ else:
512
+ init_kernel_size = 3
513
+ init_padding = 1
514
+ self.init_block = nn.Conv2d(3, self.inplane,
515
+ kernel_size=init_kernel_size, padding=init_padding, bias=False)
516
+ block_class = PDCBlock_converted
517
+ else:
518
+ self.init_block = Conv2d(pdcs[0], 3, self.inplane, kernel_size=3, padding=1)
519
+ block_class = PDCBlock
520
+
521
+ self.block1_1 = block_class(pdcs[1], self.inplane, self.inplane)
522
+ self.block1_2 = block_class(pdcs[2], self.inplane, self.inplane)
523
+ self.block1_3 = block_class(pdcs[3], self.inplane, self.inplane)
524
+ self.fuseplanes.append(self.inplane) # C
525
+
526
+ inplane = self.inplane
527
+ self.inplane = self.inplane * 2
528
+ self.block2_1 = block_class(pdcs[4], inplane, self.inplane, stride=2)
529
+ self.block2_2 = block_class(pdcs[5], self.inplane, self.inplane)
530
+ self.block2_3 = block_class(pdcs[6], self.inplane, self.inplane)
531
+ self.block2_4 = block_class(pdcs[7], self.inplane, self.inplane)
532
+ self.fuseplanes.append(self.inplane) # 2C
533
+
534
+ inplane = self.inplane
535
+ self.inplane = self.inplane * 2
536
+ self.block3_1 = block_class(pdcs[8], inplane, self.inplane, stride=2)
537
+ self.block3_2 = block_class(pdcs[9], self.inplane, self.inplane)
538
+ self.block3_3 = block_class(pdcs[10], self.inplane, self.inplane)
539
+ self.block3_4 = block_class(pdcs[11], self.inplane, self.inplane)
540
+ self.fuseplanes.append(self.inplane) # 4C
541
+
542
+ self.block4_1 = block_class(pdcs[12], self.inplane, self.inplane, stride=2)
543
+ self.block4_2 = block_class(pdcs[13], self.inplane, self.inplane)
544
+ self.block4_3 = block_class(pdcs[14], self.inplane, self.inplane)
545
+ self.block4_4 = block_class(pdcs[15], self.inplane, self.inplane)
546
+ self.fuseplanes.append(self.inplane) # 4C
547
+
548
+ self.conv_reduces = nn.ModuleList()
549
+ if self.sa and self.dil is not None:
550
+ self.attentions = nn.ModuleList()
551
+ self.dilations = nn.ModuleList()
552
+ for i in range(4):
553
+ self.dilations.append(CDCM(self.fuseplanes[i], self.dil))
554
+ self.attentions.append(CSAM(self.dil))
555
+ self.conv_reduces.append(MapReduce(self.dil))
556
+ elif self.sa:
557
+ self.attentions = nn.ModuleList()
558
+ for i in range(4):
559
+ self.attentions.append(CSAM(self.fuseplanes[i]))
560
+ self.conv_reduces.append(MapReduce(self.fuseplanes[i]))
561
+ elif self.dil is not None:
562
+ self.dilations = nn.ModuleList()
563
+ for i in range(4):
564
+ self.dilations.append(CDCM(self.fuseplanes[i], self.dil))
565
+ self.conv_reduces.append(MapReduce(self.dil))
566
+ else:
567
+ for i in range(4):
568
+ self.conv_reduces.append(MapReduce(self.fuseplanes[i]))
569
+
570
+ self.classifier = nn.Conv2d(4, 1, kernel_size=1) # has bias
571
+ nn.init.constant_(self.classifier.weight, 0.25)
572
+ nn.init.constant_(self.classifier.bias, 0)
573
+
574
+ # print('initialization done')
575
+
576
+ def get_weights(self):
577
+ conv_weights = []
578
+ bn_weights = []
579
+ relu_weights = []
580
+ for pname, p in self.named_parameters():
581
+ if 'bn' in pname:
582
+ bn_weights.append(p)
583
+ elif 'relu' in pname:
584
+ relu_weights.append(p)
585
+ else:
586
+ conv_weights.append(p)
587
+
588
+ return conv_weights, bn_weights, relu_weights
589
+
590
+ def forward(self, x):
591
+ H, W = x.size()[2:]
592
+
593
+ x = self.init_block(x)
594
+
595
+ x1 = self.block1_1(x)
596
+ x1 = self.block1_2(x1)
597
+ x1 = self.block1_3(x1)
598
+
599
+ x2 = self.block2_1(x1)
600
+ x2 = self.block2_2(x2)
601
+ x2 = self.block2_3(x2)
602
+ x2 = self.block2_4(x2)
603
+
604
+ x3 = self.block3_1(x2)
605
+ x3 = self.block3_2(x3)
606
+ x3 = self.block3_3(x3)
607
+ x3 = self.block3_4(x3)
608
+
609
+ x4 = self.block4_1(x3)
610
+ x4 = self.block4_2(x4)
611
+ x4 = self.block4_3(x4)
612
+ x4 = self.block4_4(x4)
613
+
614
+ x_fuses = []
615
+ if self.sa and self.dil is not None:
616
+ for i, xi in enumerate([x1, x2, x3, x4]):
617
+ x_fuses.append(self.attentions[i](self.dilations[i](xi)))
618
+ elif self.sa:
619
+ for i, xi in enumerate([x1, x2, x3, x4]):
620
+ x_fuses.append(self.attentions[i](xi))
621
+ elif self.dil is not None:
622
+ for i, xi in enumerate([x1, x2, x3, x4]):
623
+ x_fuses.append(self.dilations[i](xi))
624
+ else:
625
+ x_fuses = [x1, x2, x3, x4]
626
+
627
+ e1 = self.conv_reduces[0](x_fuses[0])
628
+ e1 = F.interpolate(e1, (H, W), mode="bilinear", align_corners=False)
629
+
630
+ e2 = self.conv_reduces[1](x_fuses[1])
631
+ e2 = F.interpolate(e2, (H, W), mode="bilinear", align_corners=False)
632
+
633
+ e3 = self.conv_reduces[2](x_fuses[2])
634
+ e3 = F.interpolate(e3, (H, W), mode="bilinear", align_corners=False)
635
+
636
+ e4 = self.conv_reduces[3](x_fuses[3])
637
+ e4 = F.interpolate(e4, (H, W), mode="bilinear", align_corners=False)
638
+
639
+ outputs = [e1, e2, e3, e4]
640
+
641
+ output = self.classifier(torch.cat(outputs, dim=1))
642
+ #if not self.training:
643
+ # return torch.sigmoid(output)
644
+
645
+ outputs.append(output)
646
+ outputs = [torch.sigmoid(r) for r in outputs]
647
+ return outputs
648
+
649
+ def config_model(model):
650
+ model_options = list(nets.keys())
651
+ assert model in model_options, \
652
+ 'unrecognized model, please choose from %s' % str(model_options)
653
+
654
+ # print(str(nets[model]))
655
+
656
+ pdcs = []
657
+ for i in range(16):
658
+ layer_name = 'layer%d' % i
659
+ op = nets[model][layer_name]
660
+ pdcs.append(createConvFunc(op))
661
+
662
+ return pdcs
663
+
664
+ def pidinet():
665
+ pdcs = config_model('carv4')
666
+ dil = 24 #if args.dil else None
667
+ return PiDiNet(60, pdcs, dil=dil, sa=True)
668
+
669
+
670
+ if __name__ == '__main__':
671
+ model = pidinet()
672
+ ckp = torch.load('table5_pidinet.pth')['state_dict']
673
+ model.load_state_dict({k.replace('module.',''):v for k, v in ckp.items()})
674
+ im = cv2.imread('examples/test_my/cat_v4.png')
675
+ im = img2tensor(im).unsqueeze(0)/255.
676
+ res = model(im)[-1]
677
+ res = res>0.5
678
+ res = res.float()
679
+ res = (res[0,0].cpu().data.numpy()*255.).astype(np.uint8)
680
+ print(res.shape)
681
+ cv2.imwrite('edge.png', res)
train/src/condition/ted.py ADDED
@@ -0,0 +1,296 @@
1
+ # TEED is a Tiny but Efficient Edge Detector; it comes from LDC-B3
2
+ # with a slight modification
3
+ # LDC parameters:
4
+ # 155665
5
+ # TED > 58K
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ from .util import smish as Fsmish
12
+ from .util import Smish
13
+
14
+
15
+ def weight_init(m):
16
+ if isinstance(m, (nn.Conv2d,)):
17
+ torch.nn.init.xavier_normal_(m.weight, gain=1.0)
18
+
19
+ if m.bias is not None:
20
+ torch.nn.init.zeros_(m.bias)
21
+
22
+ # for fusion layer
23
+ if isinstance(m, (nn.ConvTranspose2d,)):
24
+ torch.nn.init.xavier_normal_(m.weight, gain=1.0)
25
+ if m.bias is not None:
26
+ torch.nn.init.zeros_(m.bias)
27
+
28
+ class CoFusion(nn.Module):
29
+ # from LDC
30
+
31
+ def __init__(self, in_ch, out_ch):
32
+ super(CoFusion, self).__init__()
33
+ self.conv1 = nn.Conv2d(in_ch, 32, kernel_size=3,
34
+ stride=1, padding=1) # before 64
35
+ self.conv3= nn.Conv2d(32, out_ch, kernel_size=3,
36
+ stride=1, padding=1)# before 64 instead of 32
37
+ self.relu = nn.ReLU()
38
+ self.norm_layer1 = nn.GroupNorm(4, 32) # before 64
39
+
40
+ def forward(self, x):
41
+ # fusecat = torch.cat(x, dim=1)
42
+ attn = self.relu(self.norm_layer1(self.conv1(x)))
43
+ attn = F.softmax(self.conv3(attn), dim=1)
44
+ return ((x * attn).sum(1)).unsqueeze(1)
45
+
46
+
47
+ class CoFusion2(nn.Module):
48
+ # TEDv14-3
49
+ def __init__(self, in_ch, out_ch):
50
+ super(CoFusion2, self).__init__()
51
+ self.conv1 = nn.Conv2d(in_ch, 32, kernel_size=3,
52
+ stride=1, padding=1) # before 64
53
+ # self.conv2 = nn.Conv2d(32, 32, kernel_size=3,
54
+ # stride=1, padding=1)# before 64
55
+ self.conv3 = nn.Conv2d(32, out_ch, kernel_size=3,
56
+ stride=1, padding=1)# before 64 instead of 32
57
+ self.smish= Smish()#nn.ReLU(inplace=True)
58
+
59
+
60
+ def forward(self, x):
61
+ # fusecat = torch.cat(x, dim=1)
62
+ attn = self.conv1(self.smish(x))
63
+ attn = self.conv3(self.smish(attn)) # before , )dim=1)
64
+
65
+ # return ((fusecat * attn).sum(1)).unsqueeze(1)
66
+ return ((x * attn).sum(1)).unsqueeze(1)
67
+
68
+ class DoubleFusion(nn.Module):
69
+ # TED fusion before the final edge map prediction
70
+ def __init__(self, in_ch, out_ch):
71
+ super(DoubleFusion, self).__init__()
72
+ self.DWconv1 = nn.Conv2d(in_ch, in_ch*8, kernel_size=3,
73
+ stride=1, padding=1, groups=in_ch) # before 64
74
+ self.PSconv1 = nn.PixelShuffle(1)
75
+
76
+ self.DWconv2 = nn.Conv2d(24, 24*1, kernel_size=3,
77
+ stride=1, padding=1,groups=24)# before 64 instead of 32
78
+
79
+ self.AF= Smish()#XAF() #nn.Tanh()# XAF() # # Smish()#
80
+
81
+
82
+ def forward(self, x):
83
+ # fusecat = torch.cat(x, dim=1)
84
+ attn = self.PSconv1(self.DWconv1(self.AF(x))) # #TEED best res TEDv14 [8, 32, 352, 352]
85
+
86
+ attn2 = self.PSconv1(self.DWconv2(self.AF(attn))) # #TEED best res TEDv14[8, 3, 352, 352]
87
+
88
+ return Fsmish(((attn2 +attn).sum(1)).unsqueeze(1)) #TED best res
89
+
90
+ class _DenseLayer(nn.Sequential):
91
+ def __init__(self, input_features, out_features):
92
+ super(_DenseLayer, self).__init__()
93
+
94
+ self.add_module('conv1', nn.Conv2d(input_features, out_features,
95
+ kernel_size=3, stride=1, padding=2, bias=True)),
96
+ self.add_module('smish1', Smish()),
97
+ self.add_module('conv2', nn.Conv2d(out_features, out_features,
98
+ kernel_size=3, stride=1, bias=True))
99
+ def forward(self, x):
100
+ x1, x2 = x
101
+
102
+ new_features = super(_DenseLayer, self).forward(Fsmish(x1)) # F.relu()
103
+
104
+ return 0.5 * (new_features + x2), x2
105
+
106
+
107
+ class _DenseBlock(nn.Sequential):
108
+ def __init__(self, num_layers, input_features, out_features):
109
+ super(_DenseBlock, self).__init__()
110
+ for i in range(num_layers):
111
+ layer = _DenseLayer(input_features, out_features)
112
+ self.add_module('denselayer%d' % (i + 1), layer)
113
+ input_features = out_features
114
+
115
+
116
+ class UpConvBlock(nn.Module):
117
+ def __init__(self, in_features, up_scale):
118
+ super(UpConvBlock, self).__init__()
119
+ self.up_factor = 2
120
+ self.constant_features = 16
121
+
122
+ layers = self.make_deconv_layers(in_features, up_scale)
123
+ assert layers is not None, layers
124
+ self.features = nn.Sequential(*layers)
125
+
126
+ def make_deconv_layers(self, in_features, up_scale):
127
+ layers = []
128
+ all_pads=[0,0,1,3,7]
129
+ for i in range(up_scale):
130
+ kernel_size = 2 ** up_scale
131
+ pad = all_pads[up_scale] # kernel_size-1
132
+ out_features = self.compute_out_features(i, up_scale)
133
+ layers.append(nn.Conv2d(in_features, out_features, 1))
134
+ layers.append(Smish())
135
+ layers.append(nn.ConvTranspose2d(
136
+ out_features, out_features, kernel_size, stride=2, padding=pad))
137
+ in_features = out_features
138
+ return layers
139
+
140
+ def compute_out_features(self, idx, up_scale):
141
+ return 1 if idx == up_scale - 1 else self.constant_features
142
+
143
+ def forward(self, x):
144
+ return self.features(x)
145
+
146
+
147
+ class SingleConvBlock(nn.Module):
148
+ def __init__(self, in_features, out_features, stride, use_ac=False):
149
+ super(SingleConvBlock, self).__init__()
150
+ # self.use_bn = use_bs
151
+ self.use_ac=use_ac
152
+ self.conv = nn.Conv2d(in_features, out_features, 1, stride=stride,
153
+ bias=True)
154
+ if self.use_ac:
155
+ self.smish = Smish()
156
+
157
+ def forward(self, x):
158
+ x = self.conv(x)
159
+ if self.use_ac:
160
+ return self.smish(x)
161
+ else:
162
+ return x
163
+
164
+ class DoubleConvBlock(nn.Module):
165
+ def __init__(self, in_features, mid_features,
166
+ out_features=None,
167
+ stride=1,
168
+ use_act=True):
169
+ super(DoubleConvBlock, self).__init__()
170
+
171
+ self.use_act = use_act
172
+ if out_features is None:
173
+ out_features = mid_features
174
+ self.conv1 = nn.Conv2d(in_features, mid_features,
175
+ 3, padding=1, stride=stride)
176
+ self.conv2 = nn.Conv2d(mid_features, out_features, 3, padding=1)
177
+ self.smish= Smish()#nn.ReLU(inplace=True)
178
+
179
+ def forward(self, x):
180
+ x = self.conv1(x)
181
+ x = self.smish(x)
182
+ x = self.conv2(x)
183
+ if self.use_act:
184
+ x = self.smish(x)
185
+ return x
186
+
187
+
188
+ class TED(nn.Module):
189
+ """ Definition of Tiny and Efficient Edge Detector
190
+ model
191
+ """
192
+
193
+ def __init__(self):
194
+ super(TED, self).__init__()
195
+ self.block_1 = DoubleConvBlock(3, 16, 16, stride=2,)
196
+ self.block_2 = DoubleConvBlock(16, 32, use_act=False)
197
+ self.dblock_3 = _DenseBlock(1, 32, 48) # [32,48,100,100] before (2, 32, 64)
198
+
199
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
200
+
201
+ # skip1 connection, see fig. 2
202
+ self.side_1 = SingleConvBlock(16, 32, 2)
203
+
204
+ # skip2 connection, see fig. 2
205
+ self.pre_dense_3 = SingleConvBlock(32, 48, 1) # before (32, 64, 1)
206
+
207
+ # USNet
208
+ self.up_block_1 = UpConvBlock(16, 1)
209
+ self.up_block_2 = UpConvBlock(32, 1)
210
+ self.up_block_3 = UpConvBlock(48, 2) # (32, 64, 1)
211
+
212
+ self.block_cat = DoubleFusion(3,3) # TEED: DoubleFusion
213
+
214
+ self.apply(weight_init)
215
+
216
+ def slice(self, tensor, slice_shape):
217
+ t_shape = tensor.shape
218
+ img_h, img_w = slice_shape
219
+ if img_w!=t_shape[-1] or img_h!=t_shape[2]:
220
+ new_tensor = F.interpolate(
221
+ tensor, size=(img_h, img_w), mode='bicubic',align_corners=False)
222
+
223
+ else:
224
+ new_tensor=tensor
225
+ # tensor[..., :height, :width]
226
+ return new_tensor
227
+ def resize_input(self,tensor):
228
+ t_shape = tensor.shape
229
+ if t_shape[2] % 8 != 0 or t_shape[3] % 8 != 0:
230
+ img_w= ((t_shape[3]// 8) + 1) * 8
231
+ img_h = ((t_shape[2] // 8) + 1) * 8
232
+ new_tensor = F.interpolate(
233
+ tensor, size=(img_h, img_w), mode='bicubic', align_corners=False)
234
+ else:
235
+ new_tensor = tensor
236
+ return new_tensor
237
+
238
+ def crop_bdcn(data1, h, w, crop_h, crop_w):
239
+ # Based on BDCN Implementation @ https://github.com/pkuCactus/BDCN
240
+ _, _, h1, w1 = data1.size()
241
+ assert (h <= h1 and w <= w1)
242
+ data = data1[:, :, crop_h:crop_h + h, crop_w:crop_w + w]
243
+ return data
244
+
245
+
246
+ def forward(self, x, single_test=False):
247
+ assert x.ndim == 4, x.shape
248
+ # suppose the image size is 352x352
249
+
250
+ # Block 1
251
+ block_1 = self.block_1(x) # [8,16,176,176]
252
+ block_1_side = self.side_1(block_1) # 16 [8,32,88,88]
253
+
254
+ # Block 2
255
+ block_2 = self.block_2(block_1) # 32 # [8,32,176,176]
256
+ block_2_down = self.maxpool(block_2) # [8,32,88,88]
257
+ block_2_add = block_2_down + block_1_side # [8,32,88,88]
258
+
259
+ # Block 3
260
+ block_3_pre_dense = self.pre_dense_3(block_2_down) # [8,64,88,88] block 3 L connection
261
+ block_3, _ = self.dblock_3([block_2_add, block_3_pre_dense]) # [8,64,88,88]
262
+
263
+ # upsampling blocks
264
+ out_1 = self.up_block_1(block_1)
265
+ out_2 = self.up_block_2(block_2)
266
+ out_3 = self.up_block_3(block_3)
267
+
268
+ results = [out_1, out_2, out_3]
269
+
270
+ # concatenate multiscale outputs
271
+ block_cat = torch.cat(results, dim=1) # Bx6xHxW
272
+ block_cat = self.block_cat(block_cat) # Bx1xHxW DoubleFusion
273
+
274
+ results.append(block_cat)
275
+ return results
276
+
277
+
278
+ if __name__ == '__main__':
279
+ batch_size = 8
280
+ img_height = 352
281
+ img_width = 352
282
+
283
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
284
+ device = "cpu"
285
+ input = torch.rand(batch_size, 3, img_height, img_width).to(device)
286
+ # target = torch.rand(batch_size, 1, img_height, img_width).to(device)
287
+ print(f"input shape: {input.shape}")
288
+ model = TED().to(device)
289
+ output = model(input)
290
+ print(f"output shapes: {[t.shape for t in output]}")
291
+
292
+ # for i in range(20000):
293
+ # print(i)
294
+ # output = model(input)
295
+ # loss = nn.MSELoss()(output[-1], target)
296
+ # loss.backward()
train/src/condition/util.py ADDED
@@ -0,0 +1,202 @@
1
+ import os
2
+ import random
3
+ import tempfile
4
+ import warnings
5
+ from contextlib import suppress
6
+ from pathlib import Path
7
+
8
+ import cv2
9
+ import numpy as np
10
+ import torch
11
+ from huggingface_hub import constants, hf_hub_download
12
+ from torch.hub import get_dir, download_url_to_file
13
+ from ast import literal_eval
14
+
15
+ import torch.nn.functional as F
16
+ import torch.nn as nn
17
+
18
+ def safe_step(x, step=2):
19
+ y = x.astype(np.float32) * float(step + 1)
20
+ y = y.astype(np.int32).astype(np.float32) / float(step)
21
+ return y
22
+
23
+ def nms(x, t, s):
24
+ x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
25
+
26
+ f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
27
+ f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
28
+ f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
29
+ f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
30
+
31
+ y = np.zeros_like(x)
32
+
33
+ for f in [f1, f2, f3, f4]:
34
+ np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
35
+
36
+ z = np.zeros_like(y, dtype=np.uint8)
37
+ z[y > t] = 255
38
+ return z
39
+
40
+
41
+ def safer_memory(x):
42
+ # Fix many MAC/AMD problems
43
+ return np.ascontiguousarray(x.copy()).copy()
44
+
45
+ UPSCALE_METHODS = ["INTER_NEAREST", "INTER_LINEAR", "INTER_AREA", "INTER_CUBIC", "INTER_LANCZOS4"]
46
+ def get_upscale_method(method_str):
47
+ assert method_str in UPSCALE_METHODS, f"Method {method_str} not found in {UPSCALE_METHODS}"
48
+ return getattr(cv2, method_str)
49
+
50
+ def pad64(x):
51
+ return int(np.ceil(float(x) / 64.0) * 64 - x)
52
+
53
+ def resize_image_with_pad(input_image, resolution, upscale_method = "", skip_hwc3=False, mode='edge'):
54
+ if skip_hwc3:
55
+ img = input_image
56
+ else:
57
+ img = HWC3(input_image)
58
+ H_raw, W_raw, _ = img.shape
59
+ if resolution == 0:
60
+ return img, lambda x: x
61
+ k = float(resolution) / float(min(H_raw, W_raw))
62
+ H_target = int(np.round(float(H_raw) * k))
63
+ W_target = int(np.round(float(W_raw) * k))
64
+ img = cv2.resize(img, (W_target, H_target), interpolation=get_upscale_method(upscale_method) if k > 1 else cv2.INTER_AREA)
65
+ H_pad, W_pad = pad64(H_target), pad64(W_target)
66
+ img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode=mode)
67
+
68
+ def remove_pad(x):
69
+ return safer_memory(x[:H_target, :W_target, ...])
70
+
71
+ return safer_memory(img_padded), remove_pad
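+
+ # Usage sketch (hypothetical sizes): scale the short side to `resolution`, pad to a
+ # multiple of 64, then undo the padding on the result:
+ #   img = np.zeros((480, 640, 3), dtype=np.uint8)
+ #   padded, remove_pad = resize_image_with_pad(img, 512, "INTER_CUBIC")
+ #   restored = remove_pad(padded)  # resized but unpadded array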
72
+
73
+ def common_input_validate(input_image, output_type, **kwargs):
74
+ if "img" in kwargs:
75
+ warnings.warn("img is deprecated, please use `input_image=...` instead.", DeprecationWarning)
76
+ input_image = kwargs.pop("img")
77
+
78
+ if "return_pil" in kwargs:
79
+ warnings.warn("return_pil is deprecated. Use output_type instead.", DeprecationWarning)
80
+ output_type = "pil" if kwargs["return_pil"] else "np"
81
+
82
+ if type(output_type) is bool:
83
+ warnings.warn("Passing `True` or `False` to `output_type` is deprecated and will raise an error in future versions")
84
+ if output_type:
85
+ output_type = "pil"
86
+
87
+ if input_image is None:
88
+ raise ValueError("input_image must be defined.")
89
+
90
+ if not isinstance(input_image, np.ndarray):
91
+ input_image = np.array(input_image, dtype=np.uint8)
92
+ output_type = output_type or "pil"
93
+ else:
94
+ output_type = output_type or "np"
95
+
96
+ return (input_image, output_type)
97
+
98
+ def HWC3(x):
99
+ assert x.dtype == np.uint8
100
+ if x.ndim == 2:
101
+ x = x[:, :, None]
102
+ assert x.ndim == 3
103
+ H, W, C = x.shape
104
+ assert C == 1 or C == 3 or C == 4
105
+ if C == 3:
106
+ return x
107
+ if C == 1:
108
+ return np.concatenate([x, x, x], axis=2)
109
+ if C == 4:
110
+ color = x[:, :, 0:3].astype(np.float32)
111
+ alpha = x[:, :, 3:4].astype(np.float32) / 255.0
112
+ y = color * alpha + 255.0 * (1.0 - alpha)
113
+ y = y.clip(0, 255).astype(np.uint8)
114
+ return y
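+
+ # In short: grayscale (H, W) or (H, W, 1) inputs are tiled to three channels, RGB is
+ # returned unchanged, and RGBA is alpha-composited over a white background.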
115
+
116
+ def get_intensity_mask(image_array, lower_bound, upper_bound):
117
+ mask = image_array[:, :, 0]
118
+ mask = np.where((mask >= lower_bound) & (mask <= upper_bound), mask, 0)
119
+ mask = np.expand_dims(mask, 2).repeat(3, axis=2)
120
+ return mask
121
+
122
+ def combine_layers(base_layer, top_layer):
123
+ mask = top_layer.astype(bool)
124
+ temp = 1 - (1 - top_layer) * (1 - base_layer)
125
+ result = base_layer * (~mask) + temp * mask
126
+ return result
127
+
128
+ @torch.jit.script
129
+ def mish(input):
130
+ """
131
+ Applies the mish function element-wise:
132
+ mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
133
+ See additional documentation for mish class.
134
+ """
135
+ return input * torch.tanh(F.softplus(input))
136
+
137
+ @torch.jit.script
138
+ def smish(input):
139
+ """
140
+ Applies the smish function element-wise:
141
+ smish(x) = x * tanh(ln(1 + sigmoid(x)))
142
+ See additional documentation for the Smish class.
143
+ """
144
+ return input * torch.tanh(torch.log(1+torch.sigmoid(input)))
145
+
146
+
147
+ class Mish(nn.Module):
148
+ """
149
+ Applies the mish function element-wise:
150
+ mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
151
+ Shape:
152
+ - Input: (N, *) where * means, any number of additional
153
+ dimensions
154
+ - Output: (N, *), same shape as the input
155
+ Examples:
156
+ >>> m = Mish()
157
+ >>> input = torch.randn(2)
158
+ >>> output = m(input)
159
+ Reference: https://pytorch.org/docs/stable/generated/torch.nn.Mish.html
160
+ """
161
+
162
+ def __init__(self):
163
+ """
164
+ Init method.
165
+ """
166
+ super().__init__()
167
+
168
+ def forward(self, input):
169
+ """
170
+ Forward pass of the function.
171
+ """
172
+ if torch.__version__ >= "1.9":
173
+ return F.mish(input)
174
+ else:
175
+ return mish(input)
176
+
177
+ class Smish(nn.Module):
178
+ """
179
+ Applies the smish function element-wise:
180
+ smish(x) = x * tanh(ln(1 + sigmoid(x)))
181
+ Shape:
182
+ - Input: (N, *) where * means, any number of additional
183
+ dimensions
184
+ - Output: (N, *), same shape as the input
185
+ Examples:
186
+ >>> m = Smish()
187
+ >>> input = torch.randn(2)
188
+ >>> output = m(input)
189
+ Reference: https://pytorch.org/docs/stable/generated/torch.nn.Mish.html
190
+ """
191
+
192
+ def __init__(self):
193
+ """
194
+ Init method.
195
+ """
196
+ super().__init__()
197
+
198
+ def forward(self, input):
199
+ """
200
+ Forward pass of the function.
201
+ """
202
+ return smish(input)
train/src/generate_diff_mask.py ADDED
@@ -0,0 +1,301 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Standalone script: Given two images, generate a final difference mask using the
4
+ same pipeline as visualize_mask_diff (without any visualization output).
5
+
6
+ Pipeline:
7
+ 1) Align images to a preferred resolution/crop so they share the same size.
8
+ 2) Pixel-diff screening across parameter combinations; skip if any hull ratio is
9
+ outside [hull_min_allowed, hull_max_allowed].
10
+ 3) Color-diff to produce the final mask; remove small areas and re-check hull
11
+ ratio. Save final mask to output path.
12
+ """
13
+
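+ # Example invocation (hypothetical file names), single-pair mode handled in main() below:
+ #   python generate_diff_mask.py --source before.png --target after.png \
+ #       --output mask.png --roll_radius 15 --roll_iters 5
+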
14
+ import os
15
+ import json
16
+ import argparse
17
+ from typing import Tuple, Optional
18
+
19
+ import numpy as np
20
+ from PIL import Image
21
+ import cv2
22
+
23
+
24
+ PREFERRED_KONTEXT_RESOLUTIONS = [
25
+ (672, 1568), (688, 1504), (720, 1456), (752, 1392), (800, 1328),
26
+ (832, 1248), (880, 1184), (944, 1104), (1024, 1024), (1104, 944),
27
+ (1184, 880), (1248, 832), (1328, 800), (1392, 752), (1456, 720),
28
+ (1504, 688), (1568, 672),
29
+ ]
30
+
31
+
32
+ def choose_preferred_resolution(image_width: int, image_height: int) -> Tuple[int, int]:
33
+ aspect_ratio = image_width / max(1, image_height)
34
+ best = min(((abs(aspect_ratio - (w / h)), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS), key=lambda x: x[0])
35
+ _, w_best, h_best = best
36
+ return int(w_best), int(h_best)
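+
+ # Worked example: a 768x1024 input has aspect ratio 0.75; the closest preferred bucket
+ # is (880, 1184) with ratio ~0.743, so this helper returns (880, 1184).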
37
+
38
+
39
+ def align_images(source_path: str, target_path: str) -> Tuple[Image.Image, Image.Image]:
40
+ source_img = Image.open(source_path).convert("RGB")
41
+ target_img = Image.open(target_path).convert("RGB")
42
+
43
+ pref_w, pref_h = choose_preferred_resolution(source_img.width, source_img.height)
44
+ source_resized = source_img.resize((pref_w, pref_h), Image.Resampling.LANCZOS)
45
+
46
+ tgt_w, tgt_h = target_img.width, target_img.height
47
+ crop_w = min(source_resized.width, tgt_w)
48
+ crop_h = min(source_resized.height, tgt_h)
49
+
50
+ source_aligned = source_resized.crop((0, 0, crop_w, crop_h))
51
+ target_aligned = target_img.crop((0, 0, crop_w, crop_h))
52
+ return source_aligned, target_aligned
53
+
54
+
55
+ def pil_to_cv_gray(img: Image.Image) -> np.ndarray:
56
+ bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
57
+ gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
58
+ return gray
59
+
60
+
61
+ def generate_pixel_diff_mask(img1: Image.Image, img2: Image.Image, threshold: Optional[int] = None, clean_kernel_size: Optional[int] = 11) -> np.ndarray:
62
+ img1_gray = pil_to_cv_gray(img1)
63
+ img2_gray = pil_to_cv_gray(img2)
64
+ diff = cv2.absdiff(img1_gray, img2_gray)
65
+ if threshold is None:
66
+ mask = cv2.threshold(diff, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
67
+ else:
68
+ mask = cv2.threshold(diff, int(threshold), 255, cv2.THRESH_BINARY)[1]
69
+ if clean_kernel_size and clean_kernel_size > 0:
70
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (clean_kernel_size, clean_kernel_size))
71
+ mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
72
+ mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
73
+ return mask
74
+
75
+
76
+ def generate_color_diff_mask(img1: Image.Image, img2: Image.Image, threshold: Optional[int] = None, clean_kernel_size: Optional[int] = 21) -> np.ndarray:
77
+ bgr1 = cv2.cvtColor(np.array(img1), cv2.COLOR_RGB2BGR)
78
+ bgr2 = cv2.cvtColor(np.array(img2), cv2.COLOR_RGB2BGR)
79
+ lab1 = cv2.cvtColor(bgr1, cv2.COLOR_BGR2LAB).astype("float32")
80
+ lab2 = cv2.cvtColor(bgr2, cv2.COLOR_BGR2LAB).astype("float32")
81
+ diff = lab1 - lab2
82
+ dist = np.sqrt(np.sum(diff * diff, axis=2))
83
+ dist_u8 = cv2.normalize(dist, None, 0, 255, cv2.NORM_MINMAX).astype("uint8")
84
+ if threshold is None:
85
+ mask = cv2.threshold(dist_u8, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
86
+ else:
87
+ mask = cv2.threshold(dist_u8, int(threshold), 255, cv2.THRESH_BINARY)[1]
88
+ if clean_kernel_size and clean_kernel_size > 0:
89
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (clean_kernel_size, clean_kernel_size))
90
+ mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
91
+ mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
92
+ return mask
93
+
94
+
95
+ def compute_unified_contour(mask_bin: np.ndarray, contours: list, min_area: int = 40, method: str = "morph", morph_kernel: int = 15, morph_iters: int = 1, approx_epsilon_ratio: float = 0.01):
96
+ valid_cnts = []
97
+ for c in contours:
98
+ if cv2.contourArea(c) >= max(1, min_area):
99
+ valid_cnts.append(c)
100
+ if not valid_cnts:
101
+ return None
102
+ if method == "convex_hull":
103
+ all_points = np.vstack(valid_cnts)
104
+ hull = cv2.convexHull(all_points)
105
+ epsilon = approx_epsilon_ratio * cv2.arcLength(hull, True)
106
+ unified = cv2.approxPolyDP(hull, epsilon, True)
107
+ return unified
108
+ union = np.zeros_like(mask_bin)
109
+ cv2.drawContours(union, valid_cnts, -1, 255, thickness=-1)
110
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (morph_kernel, morph_kernel))
111
+ union_closed = union.copy()
112
+ for _ in range(max(1, morph_iters)):
113
+ union_closed = cv2.morphologyEx(union_closed, cv2.MORPH_CLOSE, kernel)
114
+ ext = cv2.findContours(union_closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
115
+ ext = ext[0] if len(ext) == 2 else ext[1]
116
+ if not ext:
117
+ return None
118
+ largest = max(ext, key=cv2.contourArea)
119
+ epsilon = approx_epsilon_ratio * cv2.arcLength(largest, True)
120
+ unified = cv2.approxPolyDP(largest, epsilon, True)
121
+ return unified
122
+
123
+
124
+ def compute_hull_area_ratio(mask: np.ndarray, min_area: int = 40) -> float:
125
+ mask_bin = (mask > 0).astype("uint8") * 255
126
+ cnts = cv2.findContours(mask_bin, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
127
+ cnts = cnts[0] if len(cnts) == 2 else cnts[1]
128
+ if not cnts:
129
+ return 0.0
130
+ hull_cnt = compute_unified_contour(mask_bin, cnts, min_area=min_area, method="convex_hull", morph_kernel=15, morph_iters=1)
131
+ if hull_cnt is None or len(hull_cnt) < 3:
132
+ return 0.0
133
+ hull_area = float(cv2.contourArea(hull_cnt))
134
+ img_area = float(mask_bin.shape[0] * mask_bin.shape[1])
135
+ return hull_area / max(1.0, img_area)
136
+
137
+
138
+ def clean_and_fill_mask(mask: np.ndarray, min_area: int = 40) -> np.ndarray:
139
+ mask_bin = (mask > 0).astype("uint8") * 255
140
+ cnts = cv2.findContours(mask_bin, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
141
+ cnts = cnts[0] if len(cnts) == 2 else cnts[1]
142
+ cleaned = np.zeros_like(mask_bin)
143
+ for c in cnts:
144
+ if cv2.contourArea(c) >= max(1, min_area):
145
+ cv2.drawContours(cleaned, [c], 0, 255, -1)
146
+ return cleaned
147
+
148
+
149
+ def generate_final_difference_mask(source_path: str,
150
+ target_path: str,
151
+ hull_min_allowed: float = 0.001,
152
+ hull_max_allowed: float = 0.75,
153
+ pixel_parameters: Optional[list] = None,
154
+ pixel_clean_kernel_default: int = 11,
155
+ color_clean_kernel: int = 3,
156
+ roll_radius: int = 0,
157
+ roll_iters: int = 1) -> Optional[np.ndarray]:
158
+ if pixel_parameters is None:
159
+ # Mirrors the tuned combinations used in the visualization script
160
+ pixel_parameters = [(None, 5), (None, 11), (50, 5)]
161
+
162
+ src_img, tgt_img = align_images(source_path, target_path)
163
+
164
+ # Pixel screening across parameter combinations
165
+ violation = False
166
+ for thr, ksize in pixel_parameters:
167
+ pm = generate_pixel_diff_mask(src_img, tgt_img, threshold=thr, clean_kernel_size=ksize)
168
+ r = compute_hull_area_ratio(pm, min_area=40)
169
+ if r < hull_min_allowed or r > hull_max_allowed:
170
+ violation = True
171
+ break
172
+ if violation:
173
+ # Failure: do not produce any mask
174
+ return None
175
+
176
+ # Color-based final mask → cleaned small areas
177
+ color_mask = generate_color_diff_mask(src_img, tgt_img, threshold=None, clean_kernel_size=color_clean_kernel)
178
+ cleaned = clean_and_fill_mask(color_mask, min_area=40)
179
+
180
+ # Produce binary mask from the convex hull contour of the cleaned mask
181
+ mask_bin = (cleaned > 0).astype("uint8") * 255
182
+ cnts = cv2.findContours(mask_bin, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
183
+ cnts = cnts[0] if len(cnts) == 2 else cnts[1]
184
+ hull_cnt = compute_unified_contour(mask_bin, cnts, min_area=40, method="convex_hull", morph_kernel=15, morph_iters=1)
185
+ if hull_cnt is None or len(hull_cnt) < 3:
186
+ return None
187
+
188
+ h_mask = np.zeros_like(mask_bin)
189
+ cv2.drawContours(h_mask, [hull_cnt], -1, 255, thickness=-1)
190
+
191
+ # Rolling-circle smoothing: closing then opening with a disk of radius R
192
+ if roll_radius and roll_radius > 0 and roll_iters and roll_iters > 0:
193
+ ksize = max(1, 2 * int(roll_radius) + 1)
194
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (ksize, ksize))
195
+ for _ in range(max(1, roll_iters)):
196
+ h_mask = cv2.morphologyEx(h_mask, cv2.MORPH_CLOSE, kernel)
197
+ h_mask = cv2.morphologyEx(h_mask, cv2.MORPH_OPEN, kernel)
198
+
199
+ # Final hull ratio check on the hull-filled binary mask
200
+ r_final = compute_hull_area_ratio(h_mask, min_area=40)
201
+ if r_final > hull_max_allowed or r_final < hull_min_allowed:
202
+ return None
203
+
204
+ return h_mask
205
+
206
+
207
+ def main():
208
+ parser = argparse.ArgumentParser(description="Generate final difference mask (single pair or whole dataset)")
209
+ # Single-pair mode (optional): if provided, runs single pair; otherwise runs dataset mode
210
+ parser.add_argument("--source", help="Path to source image")
211
+ parser.add_argument("--target", help="Path to target image")
212
+ parser.add_argument("--output", help="Path to write the final mask (PNG)")
213
+ # Dataset mode (defaults to user's dataset paths)
214
+ parser.add_argument("--dataset_dir", default="/home/lzc/KontextFill/InstructV2V/extracted_dataset", help="Base dataset dir with source_images/ and target_images/")
215
+ parser.add_argument("--dataset_output_dir", default="/home/lzc/KontextFill/visualizations_masks/inference_masks_smoothing", help="Output directory for batch masks")
216
+ parser.add_argument("--json_path", default="/home/lzc/KontextFill/InstructV2V/extracted_dataset/extracted_data.json", help="Dataset JSON mapping with fields 'source_image' and 'target_image'")
217
+ # Common params
218
+ parser.add_argument("--hull_min_allowed", type=float, default=0.001)
219
+ parser.add_argument("--hull_max_allowed", type=float, default=0.75)
220
+ parser.add_argument("--color_clean_kernel", type=int, default=3)
221
+ parser.add_argument("--roll_radius", type=int, default=15, help="Rolling-circle smoothing radius (pixels); 0 disables")
222
+ parser.add_argument("--roll_iters", type=int, default=5, help="Rolling smoothing iterations")
223
+
224
+ args = parser.parse_args()
225
+
226
+ pixel_parameters = [(None, 5), (None, 11), (50, 5)]
227
+
228
+ # Decide mode: single or dataset
229
+ if args.source and args.target and args.output:
230
+ mask = generate_final_difference_mask(
231
+ source_path=args.source,
232
+ target_path=args.target,
233
+ hull_min_allowed=args.hull_min_allowed,
234
+ hull_max_allowed=args.hull_max_allowed,
235
+ pixel_parameters=pixel_parameters,
236
+ color_clean_kernel=args.color_clean_kernel,
237
+ roll_radius=args.roll_radius,
238
+ roll_iters=args.roll_iters,
239
+ )
240
+ if mask is None:
241
+ print("Single-pair inference failed; no output saved.")
242
+ return
243
+ os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
244
+ cv2.imwrite(args.output, mask)
245
+ return
246
+
247
+ # Dataset mode using JSON mapping
248
+ out_dir = args.dataset_output_dir
249
+ os.makedirs(out_dir, exist_ok=True)
250
+
251
+ processed = 0
252
+ skipped = 0
253
+ failed = 0
254
+ missing_files = 0
255
+ try:
256
+ with open(args.json_path, "r", encoding="utf-8") as f:
257
+ entries = json.load(f)
258
+ except Exception as e:
259
+ print(f"Failed to read JSON mapping at {args.json_path}: {e}")
260
+ entries = []
261
+
262
+ for item in entries:
263
+ try:
264
+ src_rel = item.get("source_image")
265
+ tgt_rel = item.get("target_image")
266
+ edit_id = item.get("id")
267
+ if not src_rel or not tgt_rel:
268
+ skipped += 1
269
+ continue
270
+ s = os.path.join(args.dataset_dir, src_rel)
271
+ t = os.path.join(args.dataset_dir, tgt_rel)
272
+ if not (os.path.exists(s) and os.path.exists(t)):
273
+ missing_files += 1
274
+ continue
275
+ mask = generate_final_difference_mask(
276
+ source_path=s,
277
+ target_path=t,
278
+ hull_min_allowed=args.hull_min_allowed,
279
+ hull_max_allowed=args.hull_max_allowed,
280
+ pixel_parameters=pixel_parameters,
281
+ color_clean_kernel=args.color_clean_kernel,
282
+ roll_radius=args.roll_radius,
283
+ roll_iters=args.roll_iters,
284
+ )
285
+ if mask is None:
286
+ failed += 1
287
+ continue
288
+ name = f"edit_{int(edit_id):04d}" if isinstance(edit_id, int) or (isinstance(edit_id, str) and edit_id.isdigit()) else os.path.splitext(os.path.basename(src_rel))[0]
289
+ out_path = os.path.join(out_dir, f"{name}.png")
290
+ cv2.imwrite(out_path, mask)
291
+ processed += 1
292
+ except Exception as e:
293
+ skipped += 1
294
+ continue
295
+ print(f"Batch done. Processed={processed}, Failed={failed}, Skipped={skipped}, MissingFiles={missing_files}, OutputDir={out_dir}")
296
+
297
+
298
+ if __name__ == "__main__":
299
+ main()
300
+
301
+
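A minimal single-pair usage sketch of the mask generator above (hedged: the import assumes train/src is on PYTHONPATH so the script resolves as a module, and every path below is a placeholder):

# Sketch only: placeholder paths; the import path is an assumption.
import cv2
from generate_diff_mask import generate_final_difference_mask

mask = generate_final_difference_mask(
    source_path="pair_source.png",   # placeholder
    target_path="pair_target.png",   # placeholder
    hull_min_allowed=0.001,
    hull_max_allowed=0.75,
    roll_radius=15,                  # rolling-circle smoothing, matching the CLI defaults
    roll_iters=5,
)
if mask is None:
    print("Pair rejected by the hull-area screening; no mask produced.")
else:
    cv2.imwrite("pair_mask.png", mask)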
train/src/jsonl_datasets_kontext_color.py ADDED
@@ -0,0 +1,166 @@
1
+ from PIL import Image
2
+ from datasets import load_dataset
3
+ from torchvision import transforms
4
+ import random
5
+ import torch
6
+ import os
7
+ from .pipeline_flux_kontext_control import PREFERRED_KONTEXT_RESOLUTIONS
8
+ import numpy as np
9
+ from .jsonl_datasets_colorization import FlexibleColorDetector
10
+
11
+ Image.MAX_IMAGE_PIXELS = None
12
+
13
+ def multiple_16(num: float):
14
+ return int(round(num / 16) * 16)
15
+
16
+ def load_image_safely(image_path, size, root="/mnt/robby-b1/common/datasets/"):
17
+ image_path = os.path.join(root, image_path)
18
+ try:
19
+ image = Image.open(image_path).convert("RGB")
20
+ return image
21
+ except Exception as e:
22
+ print("file error: "+image_path)
23
+ with open("failed_images.txt", "a") as f:
24
+ f.write(f"{image_path}\n")
25
+ return Image.new("RGB", (size, size), (255, 255, 255))
26
+
27
+ def choose_kontext_resolution_from_wh(width: int, height: int):
28
+ aspect_ratio = width / max(1, height)
29
+ _, best_w, best_h = min(
30
+ (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
31
+ )
32
+ return best_w, best_h
33
+
34
+ color_detector = FlexibleColorDetector()
35
+
36
+ def collate_fn(examples):
37
+ if examples[0].get("cond_pixel_values") is not None:
38
+ cond_pixel_values = torch.stack([example["cond_pixel_values"] for example in examples])
39
+ cond_pixel_values = cond_pixel_values.to(memory_format=torch.contiguous_format).float()
40
+ else:
41
+ cond_pixel_values = None
42
+ # source_pixel_values has been removed; return None to stay backward compatible
43
+ source_pixel_values = None
44
+
45
+ target_pixel_values = torch.stack([example["pixel_values"] for example in examples])
46
+ target_pixel_values = target_pixel_values.to(memory_format=torch.contiguous_format).float()
47
+ token_ids_clip = torch.stack([example["token_ids_clip"] for example in examples])
48
+ token_ids_t5 = torch.stack([example["token_ids_t5"] for example in examples])
49
+
50
+ return {
51
+ "cond_pixel_values": cond_pixel_values,
52
+ "source_pixel_values": source_pixel_values,
53
+ "pixel_values": target_pixel_values,
54
+ "text_ids_1": token_ids_clip,
55
+ "text_ids_2": token_ids_t5,
56
+ }
57
+
58
+
59
+ def make_train_dataset_inpaint_mask(args, tokenizers, accelerator=None):
60
+ # Load the CSV dataset: three columns, where column 0 is the relative image path and column 2 is the caption
61
+ if args.train_data_dir is not None:
62
+ dataset = load_dataset('csv', data_files=args.train_data_dir)
63
+
64
+ # Column-name compatibility: use column 0 as the image path and column 2 as the caption
65
+ column_names = dataset["train"].column_names
66
+ image_col = column_names[0]
67
+ caption_col = column_names[2] if len(column_names) >= 3 else column_names[-1]
68
+
69
+ size = args.cond_size
70
+
71
+ # Device setup (interface kept for later use if needed)
72
+ if accelerator is not None:
73
+ device = accelerator.device
74
+ else:
75
+ device = "cpu"
76
+
77
+ # Transforms
78
+ to_tensor_and_norm = transforms.Compose([
79
+ transforms.ToTensor(),
80
+ transforms.Normalize([0.5], [0.5]),
81
+ ])
82
+
83
+ # Keep cond consistent with the colorization dataset: CenterCrop -> ToTensor -> Normalize
84
+ cond_train_transforms = transforms.Compose([
85
+ transforms.CenterCrop((size, size)),
86
+ transforms.ToTensor(),
87
+ transforms.Normalize([0.5], [0.5]),
88
+ ])
89
+
90
+ tokenizer_clip = tokenizers[0]
91
+ tokenizer_t5 = tokenizers[1]
92
+
93
+ def tokenize_prompt_clip_t5(examples):
94
+ captions_raw = examples[caption_col]
95
+ captions = []
96
+ for c in captions_raw:
97
+ if isinstance(c, str):
98
+ if random.random() < 0.25:
99
+ captions.append("")
100
+ else:
101
+ captions.append(c)
102
+ else:
103
+ captions.append("")
104
+
105
+ text_inputs_clip = tokenizer_clip(
106
+ captions,
107
+ padding="max_length",
108
+ max_length=77,
109
+ truncation=True,
110
+ return_length=False,
111
+ return_overflowing_tokens=False,
112
+ return_tensors="pt",
113
+ )
114
+ text_input_ids_1 = text_inputs_clip.input_ids
115
+
116
+ text_inputs_t5 = tokenizer_t5(
117
+ captions,
118
+ padding="max_length",
119
+ max_length=128,
120
+ truncation=True,
121
+ return_length=False,
122
+ return_overflowing_tokens=False,
123
+ return_tensors="pt",
124
+ )
125
+ text_input_ids_2 = text_inputs_t5.input_ids
126
+ return text_input_ids_1, text_input_ids_2
127
+
128
+ def preprocess_train(examples):
129
+ batch = {}
130
+
131
+ img_paths = examples[image_col]
132
+
133
+ target_tensors = []
134
+ cond_tensors = []
135
+
136
+ for p in img_paths:
137
+ # Load image by joining with root in load_image_safely
138
+ img = load_image_safely(p, size)
139
+ img = img.convert("RGB")
140
+
141
+ # Resize to Kontext preferred resolution for target
142
+ w, h = img.size
143
+ best_w, best_h = choose_kontext_resolution_from_wh(w, h)
144
+ img_rs = img.resize((best_w, best_h), resample=Image.BILINEAR)
145
+ target_tensor = to_tensor_and_norm(img_rs)
146
+
147
+ # Build color block condition
148
+ color_blocks = color_detector(input_image=img, block_size=32, output_size=size)
149
+ edge_tensor = cond_train_transforms(color_blocks)
150
+
151
+ target_tensors.append(target_tensor)
152
+ cond_tensors.append(edge_tensor)
153
+
154
+ batch["pixel_values"] = target_tensors
155
+ batch["cond_pixel_values"] = cond_tensors
156
+
157
+ batch["token_ids_clip"], batch["token_ids_t5"] = tokenize_prompt_clip_t5(examples)
158
+ return batch
159
+
160
+ if accelerator is not None:
161
+ with accelerator.main_process_first():
162
+ train_dataset = dataset["train"].with_transform(preprocess_train)
163
+ else:
164
+ train_dataset = dataset["train"].with_transform(preprocess_train)
165
+
166
+ return train_dataset
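For orientation, a minimal sketch of wiring this dataset into a DataLoader (hedged: `tokenizer_clip` and `tokenizer_t5` are assumed to be loaded elsewhere in the training script, and the CSV path and sizes are placeholders):

# Sketch only: the args fields mirror what make_train_dataset_inpaint_mask reads.
from types import SimpleNamespace
from torch.utils.data import DataLoader

args = SimpleNamespace(train_data_dir="train_pairs.csv", cond_size=512)  # placeholder values
train_dataset = make_train_dataset_inpaint_mask(args, [tokenizer_clip, tokenizer_t5])
loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
batch = next(iter(loader))
# collate_fn returns: cond_pixel_values, source_pixel_values (None here),
# pixel_values, text_ids_1, text_ids_2.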
train/src/jsonl_datasets_kontext_complete_lora.py ADDED
@@ -0,0 +1,363 @@
1
+ from PIL import Image
2
+ from torchvision import transforms
3
+ import torchvision.transforms.functional as TF
4
+ import random
5
+ import torch
6
+ import os
7
+ from datasets import load_dataset
8
+ import numpy as np
9
+ import json
10
+
11
+ Image.MAX_IMAGE_PIXELS = None
12
+
13
+ def _prepend_caption(description: str, obj_name: str) -> str:
14
+ """Build the completion instruction with a stochastic OBJECT choice.
15
+
16
+ OBJECT choice (equal probability):
17
+ - literal string "object"
18
+ - JSON field `object` with '_' replaced by space
19
+ - JSON field `description`
20
+ """
21
+ # Prepare options for OBJECT slot
22
+ cleaned_obj = (obj_name or "object").replace("_", " ").strip() or "object"
23
+ desc_opt = (description or "object").strip() or "object"
24
+ object_slot = random.choice(["object", cleaned_obj, desc_opt])
25
+
26
+ instruction = f"Complete the {object_slot}'s missing parts if necessary. White Background;"
27
+
28
+ return instruction
29
+
30
+ def collate_fn(examples):
31
+ if examples[0].get("cond_pixel_values") is not None:
32
+ cond_pixel_values = torch.stack([example["cond_pixel_values"] for example in examples])
33
+ cond_pixel_values = cond_pixel_values.to(memory_format=torch.contiguous_format).float()
34
+ else:
35
+ cond_pixel_values = None
36
+
37
+ if examples[0].get("source_pixel_values") is not None:
38
+ source_pixel_values = torch.stack([example["source_pixel_values"] for example in examples])
39
+ source_pixel_values = source_pixel_values.to(memory_format=torch.contiguous_format).float()
40
+ else:
41
+ source_pixel_values = None
42
+
43
+ target_pixel_values = torch.stack([example["pixel_values"] for example in examples])
44
+ target_pixel_values = target_pixel_values.to(memory_format=torch.contiguous_format).float()
45
+ token_ids_clip = torch.stack([example["token_ids_clip"] for example in examples])
46
+ token_ids_t5 = torch.stack([example["token_ids_t5"] for example in examples])
47
+
48
+ mask_values = None
49
+ if examples[0].get("mask_values") is not None:
50
+ mask_values = torch.stack([example["mask_values"] for example in examples])
51
+ mask_values = mask_values.to(memory_format=torch.contiguous_format).float()
52
+
53
+ return {
54
+ "cond_pixel_values": cond_pixel_values,
55
+ "source_pixel_values": source_pixel_values,
56
+ "pixel_values": target_pixel_values,
57
+ "text_ids_1": token_ids_clip,
58
+ "text_ids_2": token_ids_t5,
59
+ "mask_values": mask_values,
60
+ }
61
+
62
+
63
+ def _resolve_jsonl(path_str: str):
64
+ if path_str is None or str(path_str).strip() == "":
65
+ raise ValueError("train_data_jsonl is empty. Please set --train_data_jsonl to a JSON/JSONL file or a folder.")
66
+ if os.path.isdir(path_str):
67
+ files = [
68
+ os.path.join(path_str, f)
69
+ for f in os.listdir(path_str)
70
+ if f.lower().endswith((".jsonl", ".json"))
71
+ ]
72
+ if not files:
73
+ raise ValueError(f"No .json or .jsonl files found under directory: {path_str}")
74
+ return {"train": sorted(files)}
75
+ if not os.path.exists(path_str):
76
+ raise FileNotFoundError(f"train_data_jsonl not found: {path_str}")
77
+ return {"train": [path_str]}
78
+
79
+
80
+ def _tokenize(tokenizers, caption: str):
81
+ tokenizer_clip = tokenizers[0]
82
+ tokenizer_t5 = tokenizers[1]
83
+ text_inputs_clip = tokenizer_clip(
84
+ [caption], padding="max_length", max_length=77, truncation=True, return_tensors="pt"
85
+ )
86
+ text_inputs_t5 = tokenizer_t5(
87
+ [caption], padding="max_length", max_length=128, truncation=True, return_tensors="pt"
88
+ )
89
+ return text_inputs_clip.input_ids[0], text_inputs_t5.input_ids[0]
90
+
91
+ def _apply_white_brushstrokes(image_np: np.ndarray, mask_bin: np.ndarray = None) -> np.ndarray:
92
+ """Draw random white brushstrokes on the RGB image array and return modified array.
93
+ Strokes preferentially start within mask_bin if provided.
94
+ """
95
+ import cv2
96
+ h, w = image_np.shape[:2]
97
+ rng = random.Random()
98
+
99
+ # Determine stroke counts and sizes based on image size
100
+ ref = max(1, min(h, w))
101
+ num_strokes = rng.randint(1, 5)
102
+ max_offset = max(5, ref // 40)
103
+ min_th = max(2, ref // 40)
104
+ max_th = max(min_th + 1, ref // 5)
105
+
106
+ out = image_np.copy()
107
+ prefer_mask_p = 0.33 if mask_bin is not None and mask_bin.any() else 0.0
108
+
109
+ def rand_point_inside_mask():
110
+ ys, xs = np.where(mask_bin > 0)
111
+ if len(xs) == 0:
112
+ return rng.randrange(w), rng.randrange(h)
113
+ i = rng.randrange(len(xs))
114
+ return int(xs[i]), int(ys[i])
115
+
116
+ def rand_point_any():
117
+ return rng.randrange(w), rng.randrange(h)
118
+
119
+ for _ in range(num_strokes):
120
+ if rng.random() < prefer_mask_p:
121
+ px, py = rand_point_inside_mask()
122
+ else:
123
+ px, py = rand_point_any()
124
+
126
+ # Polyline with several jittered segments
127
+ segments = rng.randint(40, 80)
128
+ thickness = rng.randint(min_th, max_th)
129
+ for _ in range(segments):
130
+ dx = rng.randint(-max_offset, max_offset)
131
+ dy = rng.randint(-max_offset, max_offset)
132
+ nx = int(np.clip(px + dx, 0, w - 1))
133
+ ny = int(np.clip(py + dy, 0, h - 1))
134
+ cv2.line(out, (px, py), (nx, ny), (255, 255, 255), thickness)
135
+ px, py = nx, ny
136
+
137
+ return out
138
+
139
+
140
+ def make_train_dataset_subjects(args, tokenizers, accelerator=None):
141
+ """
142
+ Dataset for JSONL with fields (one JSON object per line):
143
+ - white_image_path: absolute path to base image used for both pixel_values and source_pixel_values
144
+ - mask_path: absolute path to mask image (grayscale)
145
+ - img_width: target width
146
+ - img_height: target height
147
+ - description: caption text
148
+
149
+ Behavior:
150
+ - pixel_values = white_image_path resized to (img_width, img_height)
151
+ - source_pixel_values = same image but with random white brushstrokes overlaid
152
+ - mask_values = binarized mask from mask_path resized with nearest neighbor
153
+ - captions tokenized from description
154
+ """
155
+ data_files = _resolve_jsonl(getattr(args, "train_data_jsonl", None))
156
+ file_paths = data_files.get("train", [])
157
+ records = []
158
+ for p in file_paths:
159
+ with open(p, "r", encoding="utf-8") as f:
160
+ for line in f:
161
+ line = line.strip()
162
+ if not line:
163
+ continue
164
+ try:
165
+ obj = json.loads(line)
166
+ except Exception:
167
+ # Best-effort: strip any trailing commas and retry
168
+ try:
169
+ obj = json.loads(line.rstrip(","))
170
+ except Exception:
171
+ continue
172
+ # Keep only fields we need for this dataset schema
173
+ pruned = {
174
+ "white_image_path": obj.get("white_image_path"),
175
+ "mask_path": obj.get("mask_path"),
176
+ "img_width": obj.get("img_width"),
177
+ "img_height": obj.get("img_height"),
178
+ "description": obj.get("description"),
179
+ "object": obj.get("object"),
180
+ }
181
+ records.append(pruned)
182
+
183
+ size = int(getattr(args, "cond_size", 512))
184
+
185
+ to_tensor_and_norm = transforms.Compose([
186
+ transforms.ToTensor(),
187
+ transforms.Normalize([0.5], [0.5]),
188
+ ])
189
+
190
+ # Repeat each record with independent random brushstrokes
191
+ REPEATS_PER_IMAGE = 5
192
+
193
+ class SubjectsDataset(torch.utils.data.Dataset):
194
+ def __init__(self, hf_ds):
195
+ self.ds = hf_ds
196
+ self.repeats = REPEATS_PER_IMAGE
197
+ def __len__(self):
198
+ if self.repeats and self.repeats > 1:
199
+ return len(self.ds) * self.repeats
200
+ return len(self.ds)
201
+ def __getitem__(self, idx):
202
+ if self.repeats and self.repeats > 1:
203
+ base_idx = idx % len(self.ds)
204
+ else:
205
+ base_idx = idx
206
+ rec = self.ds[base_idx]
207
+
208
+ white_p = rec.get("white_image_path", "") or ""
209
+ mask_p = rec.get("mask_path", "") or ""
210
+
211
+ if not os.path.isabs(white_p):
212
+ # Require absolute paths to avoid ambiguity
213
+ raise ValueError("white_image_path must be absolute")
214
+ if not os.path.isabs(mask_p):
215
+ raise ValueError("mask_path must be absolute")
216
+
217
+ import cv2
218
+ mask_loaded = cv2.imread(mask_p, cv2.IMREAD_GRAYSCALE)
219
+ if mask_loaded is None:
220
+ raise ValueError(f"Failed to read mask: {mask_p}")
221
+
222
+ base_img = Image.open(white_p).convert("RGB")
223
+
224
+ # Desired output size
225
+ fw = int(rec.get("img_width") or base_img.width)
226
+ fh = int(rec.get("img_height") or base_img.height)
227
+ base_img = base_img.resize((fw, fh), resample=Image.BILINEAR)
228
+ mask_img = Image.fromarray(mask_loaded.astype(np.uint8)).convert("L").resize((fw, fh), Image.NEAREST)
229
+
230
+ # Tensors: target is the clean white image
231
+ target_tensor = to_tensor_and_norm(base_img)
232
+
233
+ # Binary mask at final_size
234
+ mask_np = np.array(mask_img)
235
+ mask_bin = (mask_np > 127).astype(np.uint8)
236
+
237
+ # Build source by drawing random white brushstrokes on top of the white image
238
+ base_np = np.array(base_img).astype(np.uint8)
239
+ stroked_np = _apply_white_brushstrokes(base_np, mask_bin)
240
+
241
+ # Build tensors
242
+ source_tensor = to_tensor_and_norm(Image.fromarray(stroked_np.astype(np.uint8)))
243
+ mask_tensor = torch.from_numpy(mask_bin.astype(np.float32)).unsqueeze(0)
244
+
245
+ # Caption: build instruction using description and object
246
+ description = rec.get("description", "")
247
+ obj_name = rec.get("object", "")
248
+ cap = _prepend_caption(description, obj_name)
249
+ ids1, ids2 = _tokenize(tokenizers, cap)
250
+
251
+ return {
252
+ "source_pixel_values": source_tensor,
253
+ "pixel_values": target_tensor,
254
+ "token_ids_clip": ids1,
255
+ "token_ids_t5": ids2,
256
+ "mask_values": mask_tensor,
257
+ }
258
+
259
+ return SubjectsDataset(records)
260
+
261
+
262
+
263
+
264
+ def _run_test_mode(test_jsonl: str, output_dir: str, num_samples: int = 50):
265
+ """Utility to visualize augmentation: saves pairs of (target, source) images.
266
+ Reads the JSONL directly, applies the same logic as dataset to produce
267
+ pixel_values (target) and source_pixel_values (with white strokes),
268
+ then writes them to output_dir for manual inspection.
269
+ """
270
+ os.makedirs(output_dir, exist_ok=True)
271
+ to_tensor_and_norm = transforms.Compose([
272
+ transforms.ToTensor(),
273
+ transforms.Normalize([0.5], [0.5]),
274
+ ])
275
+
276
+ # Minimal tokenizers shim to reuse dataset tokenization pipeline
277
+ class _NoOpTokenizer:
278
+ def __call__(self, texts, padding=None, max_length=None, truncation=None, return_tensors=None):
279
+ return type("T", (), {"input_ids": torch.zeros((1, 1), dtype=torch.long)})()
280
+
281
+ tokenizers = [_NoOpTokenizer(), _NoOpTokenizer()]
282
+
283
+ saved = 0
284
+ line_idx = 0
285
+ import cv2
286
+ with open(test_jsonl, "r", encoding="utf-8") as f:
287
+ for raw in f:
288
+ if saved >= num_samples:
289
+ break
290
+ raw = raw.strip()
291
+ if not raw:
292
+ continue
293
+ try:
294
+ obj = json.loads(raw)
295
+ except Exception:
296
+ try:
297
+ obj = json.loads(raw.rstrip(","))
298
+ except Exception:
299
+ continue
300
+
301
+ rec = {
302
+ "white_image_path": obj.get("white_image_path"),
303
+ "mask_path": obj.get("mask_path"),
304
+ "img_width": obj.get("img_width"),
305
+ "img_height": obj.get("img_height"),
306
+ "description": obj.get("description"),
307
+ }
308
+
309
+ white_p = rec.get("white_image_path", "") or ""
310
+ mask_p = rec.get("mask_path", "") or ""
311
+ if not white_p or not mask_p:
312
+ continue
313
+ if not (os.path.isabs(white_p) and os.path.isabs(mask_p)):
314
+ continue
315
+
316
+ mask_loaded = cv2.imread(mask_p, cv2.IMREAD_GRAYSCALE)
317
+ if mask_loaded is None:
318
+ continue
319
+
320
+ try:
321
+ base_img = Image.open(white_p).convert("RGB")
322
+ except Exception:
323
+ continue
324
+
325
+ fw = int(rec.get("img_width") or base_img.width)
326
+ fh = int(rec.get("img_height") or base_img.height)
327
+ base_img = base_img.resize((fw, fh), resample=Image.BILINEAR)
328
+ mask_img = Image.fromarray(mask_loaded.astype(np.uint8)).convert("L").resize((fw, fh), Image.NEAREST)
329
+
330
+ mask_np = np.array(mask_img)
331
+ mask_bin = (mask_np > 127).astype(np.uint8)
332
+
333
+ base_np = np.array(base_img).astype(np.uint8)
334
+ stroked_np = _apply_white_brushstrokes(base_np, mask_bin)
335
+
336
+ # Save images
337
+ idx_str = f"{line_idx:05d}"
338
+ try:
339
+ Image.fromarray(base_np).save(os.path.join(output_dir, f"{idx_str}_target.jpg"))
340
+ Image.fromarray(stroked_np).save(os.path.join(output_dir, f"{idx_str}_source.jpg"))
341
+ Image.fromarray((mask_bin * 255).astype(np.uint8)).save(os.path.join(output_dir, f"{idx_str}_mask.png"))
342
+ saved += 1
343
+ except Exception:
344
+ pass
345
+ line_idx += 1
346
+
347
+
348
+ def _parse_test_args():
349
+ import argparse
350
+ parser = argparse.ArgumentParser(description="Test visualization for Kontext complete dataset")
351
+ parser.add_argument("--test_jsonl", type=str, default="/robby/share/Editing/lzc/subject_completion/white_bg_picked/results_picked_filtered.jsonl", help="Path to JSONL to preview")
352
+ parser.add_argument("--output_dir", type=str, default="/robby/share/Editing/lzc/subject_completion/train_test", help="Output directory to save pairs")
353
+ parser.add_argument("--num_samples", type=int, default=50, help="Number of pairs to save")
354
+ return parser.parse_args()
355
+
356
+
357
+ if __name__ == "__main__":
358
+ try:
359
+ args = _parse_test_args()
360
+ _run_test_mode(args.test_jsonl, args.output_dir, args.num_samples)
361
+ except SystemExit:
362
+ # Allow import usage without triggering test mode
363
+ pass
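For reference, an illustrative record for the JSONL consumed by make_train_dataset_subjects above; the field names come from the docstring, while every value here is a placeholder:

# Sketch only: one JSON object like this per line in the training JSONL.
import json

record = {
    "white_image_path": "/abs/path/object_on_white.png",  # placeholder absolute path
    "mask_path": "/abs/path/object_mask.png",             # placeholder absolute path
    "img_width": 1024,
    "img_height": 1024,
    "description": "a red ceramic mug",
    "object": "coffee_mug",  # underscores are replaced by spaces when building the instruction
}
print(json.dumps(record))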
train/src/jsonl_datasets_kontext_edge.py ADDED
@@ -0,0 +1,225 @@
1
+ from PIL import Image
2
+ from datasets import load_dataset
3
+ from torchvision import transforms
4
+ import random
5
+ import torch
6
+ import os
7
+ from .pipeline_flux_kontext_control import PREFERRED_KONTEXT_RESOLUTIONS
8
+ import numpy as np
9
+ from src.condition.edge_extraction import (
10
+ CannyDetector, PidiNetDetector, TEDDetector, LineartStandardDetector, HEDdetector,
11
+ AnyLinePreprocessor, LineartDetector, InformativeDetector
12
+ )
13
+
14
+ Image.MAX_IMAGE_PIXELS = None
15
+
16
+ def multiple_16(num: float):
17
+ return int(round(num / 16) * 16)
18
+
19
+ def load_image_safely(image_path, size, root="/mnt/robby-b1/common/datasets/"):
20
+ image_path = os.path.join(root, image_path)
21
+ try:
22
+ image = Image.open(image_path).convert("RGB")
23
+ return image
24
+ except Exception as e:
25
+ print("file error: "+image_path)
26
+ with open("failed_images.txt", "a") as f:
27
+ f.write(f"{image_path}\n")
28
+ return Image.new("RGB", (size, size), (255, 255, 255))
29
+
30
+ def choose_kontext_resolution_from_wh(width: int, height: int):
31
+ aspect_ratio = width / max(1, height)
32
+ _, best_w, best_h = min(
33
+ (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
34
+ )
35
+ return best_w, best_h
36
+
37
+ class EdgeExtractorManager:
38
+ _instance = None
39
+ _initialized = False
40
+
41
+ def __new__(cls):
42
+ if cls._instance is None:
43
+ cls._instance = super(EdgeExtractorManager, cls).__new__(cls)
44
+ return cls._instance
45
+
46
+ def __init__(self):
47
+ if not self._initialized:
48
+ self.edge_extractors = None
49
+ self.device = None
50
+ self._initialized = True
51
+
52
+ def set_device(self, device):
53
+ self.device = device
54
+
55
+ def get_edge_extractors(self, device=None):
56
+ # Force initialization on CPU to avoid triggering CUDA init inside DataLoader worker processes
57
+ current_device = "cpu"
58
+ if device is not None:
59
+ self.set_device(current_device)
60
+
61
+ if self.edge_extractors is None or len(self.edge_extractors) == 0:
62
+ self.edge_extractors = [
63
+ ("canny", CannyDetector()),
64
+ ("pidinet", PidiNetDetector.from_pretrained().to(current_device)),
65
+ ("ted", TEDDetector.from_pretrained().to(current_device)),
66
+ # ("lineart_standard", LineartStandardDetector()),
67
+ ("hed", HEDdetector.from_pretrained().to(current_device)),
68
+ ("anyline", AnyLinePreprocessor.from_pretrained().to(current_device)),
69
+ # ("lineart", LineartDetector.from_pretrained().to(current_device)),
70
+ ("informative", InformativeDetector.from_pretrained().to(current_device)),
71
+ ]
72
+
73
+ return self.edge_extractors
74
+
75
+ edge_extractor_manager = EdgeExtractorManager()
76
+
77
+ def collate_fn(examples):
78
+ if examples[0].get("cond_pixel_values") is not None:
79
+ cond_pixel_values = torch.stack([example["cond_pixel_values"] for example in examples])
80
+ cond_pixel_values = cond_pixel_values.to(memory_format=torch.contiguous_format).float()
81
+ else:
82
+ cond_pixel_values = None
83
+ source_pixel_values = None
84
+
85
+ target_pixel_values = torch.stack([example["pixel_values"] for example in examples])
86
+ target_pixel_values = target_pixel_values.to(memory_format=torch.contiguous_format).float()
87
+ token_ids_clip = torch.stack([example["token_ids_clip"] for example in examples])
88
+ token_ids_t5 = torch.stack([example["token_ids_t5"] for example in examples])
89
+
90
+ return {
91
+ "cond_pixel_values": cond_pixel_values,
92
+ "source_pixel_values": source_pixel_values,
93
+ "pixel_values": target_pixel_values,
94
+ "text_ids_1": token_ids_clip,
95
+ "text_ids_2": token_ids_t5,
96
+ }
97
+
98
+
99
+ def make_train_dataset_inpaint_mask(args, tokenizers, accelerator=None):
100
+ # Load the CSV dataset: three columns, where column 0 is the relative image path and column 2 is the caption
101
+ if args.train_data_dir is not None:
102
+ dataset = load_dataset('csv', data_files=args.train_data_dir)
103
+
104
+ # Column-name compatibility: use column 0 as the image path and column 2 as the caption
105
+ column_names = dataset["train"].column_names
106
+ image_col = column_names[0]
107
+ caption_col = column_names[2] if len(column_names) >= 3 else column_names[-1]
108
+
109
+ size = args.cond_size
110
+
111
+ # Device setup (used to place some detectors on the matching GPU in distributed runs)
112
+ if accelerator is not None:
113
+ device = accelerator.device
114
+ edge_extractor_manager.set_device(device)
115
+ else:
116
+ device = "cpu"
117
+
118
+ # Transforms
119
+ to_tensor_and_norm = transforms.Compose([
120
+ transforms.ToTensor(),
121
+ transforms.Normalize([0.5], [0.5]),
122
+ ])
123
+
124
+ # Keep consistent with jsonl_datasets_edge.py: Resize -> CenterCrop -> ToTensor -> Normalize
125
+ cond_train_transforms = transforms.Compose([
126
+ transforms.Resize((size, size), interpolation=transforms.InterpolationMode.BILINEAR),
127
+ transforms.CenterCrop((size, size)),
128
+ transforms.ToTensor(),
129
+ transforms.Normalize([0.5], [0.5]),
130
+ ])
131
+
132
+ tokenizer_clip = tokenizers[0]
133
+ tokenizer_t5 = tokenizers[1]
134
+
135
+ def tokenize_prompt_clip_t5(examples):
136
+ captions_raw = examples[caption_col]
137
+ captions = []
138
+ for c in captions_raw:
139
+ if isinstance(c, str):
140
+ if random.random() < 0.25:
141
+ captions.append("")
142
+ else:
143
+ captions.append(c)
144
+ else:
145
+ captions.append("")
146
+
147
+ text_inputs_clip = tokenizer_clip(
148
+ captions,
149
+ padding="max_length",
150
+ max_length=77,
151
+ truncation=True,
152
+ return_length=False,
153
+ return_overflowing_tokens=False,
154
+ return_tensors="pt",
155
+ )
156
+ text_input_ids_1 = text_inputs_clip.input_ids
157
+
158
+ text_inputs_t5 = tokenizer_t5(
159
+ captions,
160
+ padding="max_length",
161
+ max_length=128,
162
+ truncation=True,
163
+ return_length=False,
164
+ return_overflowing_tokens=False,
165
+ return_tensors="pt",
166
+ )
167
+ text_input_ids_2 = text_inputs_t5.input_ids
168
+ return text_input_ids_1, text_input_ids_2
169
+
170
+ def preprocess_train(examples):
171
+ batch = {}
172
+
173
+ img_paths = examples[image_col]
174
+
175
+ target_tensors = []
176
+ cond_tensors = []
177
+
178
+ for p in img_paths:
179
+ # Load image by joining with root in load_image_safely
180
+ img = load_image_safely(p, size)
181
+ img = img.convert("RGB")
182
+
183
+ # Resize to Kontext preferred resolution for target
184
+ w, h = img.size
185
+ best_w, best_h = choose_kontext_resolution_from_wh(w, h)
186
+ img_rs = img.resize((best_w, best_h), resample=Image.BILINEAR)
187
+ target_tensor = to_tensor_and_norm(img_rs)
188
+
189
+ # Build edge condition
190
+ extractor_name, extractor = random.choice(edge_extractor_manager.get_edge_extractors())
191
+ img_np = np.array(img)
192
+ if extractor_name == "informative":
193
+ edge = extractor(img_np, style="contour")
194
+ else:
195
+ edge = extractor(img_np)
196
+
197
+ if extractor_name == "ted":
198
+ th = 128
199
+ else:
200
+ th = 32
201
+
202
+ edge_np = np.array(edge) if isinstance(edge, Image.Image) else edge
203
+ if edge_np.ndim == 3:
204
+ edge_np = edge_np[..., 0]
205
+ edge_bin = (edge_np > th).astype(np.float32)
206
+ edge_pil = Image.fromarray((edge_bin * 255).astype(np.uint8))
207
+ edge_tensor = cond_train_transforms(edge_pil)
208
+ edge_tensor = edge_tensor.repeat(3, 1, 1)
209
+
210
+ target_tensors.append(target_tensor)
211
+ cond_tensors.append(edge_tensor)
212
+
213
+ batch["pixel_values"] = target_tensors
214
+ batch["cond_pixel_values"] = cond_tensors
215
+
216
+ batch["token_ids_clip"], batch["token_ids_t5"] = tokenize_prompt_clip_t5(examples)
217
+ return batch
218
+
219
+ if accelerator is not None:
220
+ with accelerator.main_process_first():
221
+ train_dataset = dataset["train"].with_transform(preprocess_train)
222
+ else:
223
+ train_dataset = dataset["train"].with_transform(preprocess_train)
224
+
225
+ return train_dataset
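As a small aside, a sketch of how the EdgeExtractorManager singleton above behaves (illustration only; the first get_edge_extractors() call instantiates the detectors on CPU, which assumes their pretrained weights are reachable in this environment):

# Sketch only: demonstrates the singleton and the lazily built extractor list.
m1 = EdgeExtractorManager()
m2 = EdgeExtractorManager()
assert m1 is m2                                  # __new__ always returns the same instance

extractors = m1.get_edge_extractors()            # built once, on CPU
assert m1.get_edge_extractors() is extractors    # later calls reuse the same list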
train/src/jsonl_datasets_kontext_interactive_lora.py ADDED
@@ -0,0 +1,1332 @@
1
+ from PIL import Image
2
+ from torchvision import transforms
3
+ import torchvision.transforms.functional as TF
4
+ import random
5
+ import torch
6
+ import os
7
+ from datasets import load_dataset
8
+ import numpy as np
9
+ import json
10
+
11
+ Image.MAX_IMAGE_PIXELS = None
12
+
13
+
14
+ def collate_fn(examples):
15
+ if examples[0].get("cond_pixel_values") is not None:
16
+ cond_pixel_values = torch.stack([example["cond_pixel_values"] for example in examples])
17
+ cond_pixel_values = cond_pixel_values.to(memory_format=torch.contiguous_format).float()
18
+ else:
19
+ cond_pixel_values = None
20
+
21
+ if examples[0].get("source_pixel_values") is not None:
22
+ source_pixel_values = torch.stack([example["source_pixel_values"] for example in examples])
23
+ source_pixel_values = source_pixel_values.to(memory_format=torch.contiguous_format).float()
24
+ else:
25
+ source_pixel_values = None
26
+
27
+ target_pixel_values = torch.stack([example["pixel_values"] for example in examples])
28
+ target_pixel_values = target_pixel_values.to(memory_format=torch.contiguous_format).float()
29
+ token_ids_clip = torch.stack([example["token_ids_clip"] for example in examples])
30
+ token_ids_t5 = torch.stack([example["token_ids_t5"] for example in examples])
31
+
32
+ mask_values = None
33
+ if examples[0].get("mask_values") is not None:
34
+ mask_values = torch.stack([example["mask_values"] for example in examples])
35
+ mask_values = mask_values.to(memory_format=torch.contiguous_format).float()
36
+
37
+ return {
38
+ "cond_pixel_values": cond_pixel_values,
39
+ "source_pixel_values": source_pixel_values,
40
+ "pixel_values": target_pixel_values,
41
+ "text_ids_1": token_ids_clip,
42
+ "text_ids_2": token_ids_t5,
43
+ "mask_values": mask_values,
44
+ }
45
+
46
+
47
+ def _resolve_jsonl(path_str: str):
48
+ if path_str is None or str(path_str).strip() == "":
49
+ raise ValueError("train_data_jsonl is empty. Please set --train_data_jsonl to a JSON/JSONL file or a folder.")
50
+ if os.path.isdir(path_str):
51
+ files = [
52
+ os.path.join(path_str, f)
53
+ for f in os.listdir(path_str)
54
+ if f.lower().endswith((".jsonl", ".json"))
55
+ ]
56
+ if not files:
57
+ raise ValueError(f"No .json or .jsonl files found under directory: {path_str}")
58
+ return {"train": sorted(files)}
59
+ if not os.path.exists(path_str):
60
+ raise FileNotFoundError(f"train_data_jsonl not found: {path_str}")
61
+ return {"train": [path_str]}
62
+
63
+
64
+ def _tokenize(tokenizers, caption: str):
65
+ tokenizer_clip = tokenizers[0]
66
+ tokenizer_t5 = tokenizers[1]
67
+ text_inputs_clip = tokenizer_clip(
68
+ [caption], padding="max_length", max_length=77, truncation=True, return_tensors="pt"
69
+ )
70
+ text_inputs_t5 = tokenizer_t5(
71
+ [caption], padding="max_length", max_length=128, truncation=True, return_tensors="pt"
72
+ )
73
+ return text_inputs_clip.input_ids[0], text_inputs_t5.input_ids[0]
74
+
75
+
76
+ def _prepend_caption(caption: str) -> str:
77
+ """Prepend instruction and keep only instruction with 20% prob."""
78
+ instruction = "Fill in the white region naturally and adapt the foreground into the background. Fix the perspective of the foreground object if necessary."
79
+ if random.random() < 0.2:
80
+ return instruction
81
+ caption = caption or ""
82
+ if caption.strip():
83
+ return f"{instruction} {caption.strip()}"
84
+ return instruction
85
+
86
+
87
+ def _color_augment(pil_img: Image.Image) -> Image.Image:
88
+ brightness = random.uniform(0.8, 1.2)
89
+ contrast = random.uniform(0.8, 1.2)
90
+ saturation = random.uniform(0.8, 1.2)
91
+ hue = random.uniform(-0.05, 0.05)
92
+ img = TF.adjust_brightness(pil_img, brightness)
93
+ img = TF.adjust_contrast(img, contrast)
94
+ img = TF.adjust_saturation(img, saturation)
95
+ img = TF.adjust_hue(img, hue)
96
+ return img
97
+
98
+
99
+ def _dilate_mask(mask_bin: np.ndarray, min_px: int = 5, max_px: int = 100) -> np.ndarray:
100
+ """Grow binary mask by a random radius in [min_px, max_px]. Expects values {0,1}."""
101
+ import cv2
102
+ radius = int(max(min_px, min(max_px, random.randint(min_px, max_px))))
103
+ if radius <= 0:
104
+ return mask_bin.astype(np.uint8)
105
+ ksize = 2 * radius + 1
106
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (ksize, ksize))
107
+ grown = cv2.dilate(mask_bin.astype(np.uint8), kernel, iterations=1)
108
+ return (grown > 0).astype(np.uint8)
109
+
110
+
111
+ def _random_point_inside_mask(mask_bin: np.ndarray) -> tuple:
112
+ ys, xs = np.where(mask_bin > 0)
113
+ if len(xs) == 0:
114
+ h, w = mask_bin.shape
115
+ return w // 2, h // 2
116
+ idx = random.randrange(len(xs))
117
+ return int(xs[idx]), int(ys[idx])
118
+
119
+
120
+ def _bbox_containing_mask(mask_bin: np.ndarray, img_w: int, img_h: int) -> tuple:
121
+ ys, xs = np.where(mask_bin > 0)
122
+ if len(xs) == 0:
123
+ return 0, 0, img_w - 1, img_h - 1
124
+ x1, x2 = int(xs.min()), int(xs.max())
125
+ y1, y2 = int(ys.min()), int(ys.max())
126
+ # Random padding
127
+ max_pad = int(0.25 * min(img_w, img_h))
128
+ pad_x1 = random.randint(0, max_pad)
129
+ pad_x2 = random.randint(0, max_pad)
130
+ pad_y1 = random.randint(0, max_pad)
131
+ pad_y2 = random.randint(0, max_pad)
132
+ x1 = max(0, x1 - pad_x1)
133
+ y1 = max(0, y1 - pad_y1)
134
+ x2 = min(img_w - 1, x2 + pad_x2)
135
+ y2 = min(img_h - 1, y2 + pad_y2)
136
+ return x1, y1, x2, y2
137
+
138
+
139
+ def _constrained_random_mask(mask_bin: np.ndarray, image_h: int, image_w: int, aug_prob: float = 0.7) -> np.ndarray:
140
+ """Generate a random mask whose boxes contain or start inside the object mask, and whose brush strokes start inside it.
141
+ Returns binary 0/1 array of shape (H,W).
142
+ """
143
+ import cv2
144
+ if random.random() >= aug_prob:
145
+ return np.zeros((image_h, image_w), dtype=np.uint8)
146
+
147
+ # Scale similar to reference
148
+ ref_size = 1024
149
+ scale_factor = max(1.0, min(image_h, image_w) / float(ref_size))
150
+
151
+ out = np.zeros((image_h, image_w), dtype=np.uint8)
152
+
153
+ # Choose exactly one augmentation: bbox OR stroke
154
+ if random.random() < 0.2:
155
+ # BBox augmentation: draw N boxes (randomized), first box often contains mask
156
+ num_boxes = random.randint(1, 6)
157
+ for b in range(num_boxes):
158
+ if b == 0 and random.random() < 0.5:
159
+ x1, y1, x2, y2 = _bbox_containing_mask(mask_bin, image_w, image_h)
160
+ else:
161
+ sx, sy = _random_point_inside_mask(mask_bin)
162
+ max_w = int(500 * scale_factor)
163
+ min_w = int(100 * scale_factor)
164
+ bw = random.randint(max(1, min_w), max(2, max_w))
165
+ bh = random.randint(max(1, min_w), max(2, max_w))
166
+ x1 = max(0, sx - random.randint(0, bw))
167
+ y1 = max(0, sy - random.randint(0, bh))
168
+ x2 = min(image_w - 1, x1 + bw)
169
+ y2 = min(image_h - 1, y1 + bh)
170
+ out[y1:y2 + 1, x1:x2 + 1] = 1
171
+ else:
172
+ # Stroke augmentation: draw N strokes starting inside mask
173
+ num_strokes = random.randint(1, 6)
174
+ for _ in range(num_strokes):
175
+ num_points = random.randint(10, 30)
176
+ stroke_width = random.randint(max(1, int(100 * scale_factor)), max(2, int(400 * scale_factor)))
177
+ max_offset = max(1, int(100 * scale_factor))
178
+ start_x, start_y = _random_point_inside_mask(mask_bin)
179
+ px, py = start_x, start_y
180
+ for _ in range(num_points):
181
+ dx = random.randint(-max_offset, max_offset)
182
+ dy = random.randint(-max_offset, max_offset)
183
+ nx = int(np.clip(px + dx, 0, image_w - 1))
184
+ ny = int(np.clip(py + dy, 0, image_h - 1))
185
+ cv2.line(out, (px, py), (nx, ny), 1, stroke_width)
186
+ px, py = nx, ny
187
+
188
+ return (out > 0).astype(np.uint8)
189
+
190
+
191
+ def make_placement_dataset_subjects(args, tokenizers, accelerator=None, base_dir=None):
192
+ """
193
+ Dataset for JSONL with fields:
194
+ - generated_image_path: relative to base_dir (target image with object)
195
+ - mask_path: relative to base_dir (mask of object)
196
+ - generated_width, generated_height: image dimensions
197
+ - final_prompt: caption
198
+ - relight_images: list of {mode, path} for relighted versions
199
+
200
+ source image construction:
201
+ - background is target_image with a hole punched by grown mask
202
+ - foreground is randomly selected from relight_images with weights
203
+ - includes perspective transformation (moved from interactive dataset)
204
+
205
+ Args:
206
+ base_dir: Base directory for resolving relative paths. If None, uses args.placement_base_dir.
207
+ """
208
+ if base_dir is None:
209
+ base_dir = getattr(args, "placement_base_dir")
210
+
211
+ data_files = _resolve_jsonl(getattr(args, "placement_data_jsonl", None))
212
+ file_paths = data_files.get("train", [])
213
+ records = []
214
+ for p in file_paths:
215
+ with open(p, "r", encoding="utf-8") as f:
216
+ for line in f:
217
+ line = line.strip()
218
+ if not line:
219
+ continue
220
+ try:
221
+ obj = json.loads(line)
222
+ except Exception:
223
+ try:
224
+ obj = json.loads(line.rstrip(","))
225
+ except Exception:
226
+ continue
227
+ # Keep only fields we need
228
+ pruned = {
229
+ "generated_image_path": obj.get("generated_image_path"),
230
+ "mask_path": obj.get("mask_path"),
231
+ "generated_width": obj.get("generated_width"),
232
+ "generated_height": obj.get("generated_height"),
233
+ "final_prompt": obj.get("final_prompt"),
234
+ "relight_images": obj.get("relight_images"),
235
+ }
236
+ records.append(pruned)
237
+
238
+ size = int(getattr(args, "cond_size", 512))
239
+
240
+ to_tensor_and_norm = transforms.Compose([
241
+ transforms.ToTensor(),
242
+ transforms.Normalize([0.5], [0.5]),
243
+ ])
244
+
245
+ class PlacementDataset(torch.utils.data.Dataset):
246
+ def __init__(self, hf_ds, base_dir):
247
+ self.ds = hf_ds
248
+ self.base_dir = base_dir
249
+ def __len__(self):
250
+ # One sample per record
251
+ return len(self.ds)
252
+ def __getitem__(self, idx):
253
+ rec = self.ds[idx % len(self.ds)]
254
+
255
+ t_rel = rec.get("generated_image_path", "")
256
+ m_rel = rec.get("mask_path", "")
257
+
258
+ # Both are relative paths
259
+ t_p = os.path.join(self.base_dir, t_rel)
260
+ m_p = os.path.join(self.base_dir, m_rel)
261
+
262
+ import cv2
263
+ mask_loaded = cv2.imread(m_p, cv2.IMREAD_GRAYSCALE)
264
+ if mask_loaded is None:
265
+ raise ValueError(f"Failed to read mask: {m_p}")
266
+
267
+ tgt_img = Image.open(t_p).convert("RGB")
268
+
269
+ fw = int(rec.get("generated_width", tgt_img.width))
270
+ fh = int(rec.get("generated_height", tgt_img.height))
271
+ tgt_img = tgt_img.resize((fw, fh), resample=Image.BILINEAR)
272
+ mask_img = Image.fromarray(mask_loaded.astype(np.uint8)).convert("L").resize((fw, fh), Image.NEAREST)
273
+
274
+ target_tensor = to_tensor_and_norm(tgt_img)
275
+
276
+ # Binary mask at final_size
277
+ mask_np = np.array(mask_img)
278
+ mask_bin = (mask_np > 127).astype(np.uint8)
279
+
280
+ # 1) Grow mask by a random margin of 50-200 pixels
281
+ grown_mask = _dilate_mask(mask_bin, 50, 200)
282
+
283
+ # 2) Optional random augmentation mask constrained by mask
284
+ rand_mask = _constrained_random_mask(mask_bin, fh, fw, aug_prob=0.7)
285
+
286
+ # 3) Final union mask
287
+ union_mask = np.clip(grown_mask | rand_mask, 0, 1).astype(np.uint8)
288
+ tgt_np = np.array(tgt_img)
289
+
290
+ # Helper: choose relighted image from relight_images with weights
291
+ def _choose_relight_image(rec, width, height):
292
+ relight_list = rec.get("relight_images") or []
293
+ # Build map mode -> path
294
+ mode_to_path = {}
295
+ for it in relight_list:
296
+ try:
297
+ mode = str(it.get("mode", "")).strip().lower()
298
+ path = it.get("path")
299
+ except Exception:
300
+ continue
301
+ if not mode or not path:
302
+ continue
303
+ mode_to_path[mode] = path
304
+
305
+ weighted_order = [
306
+ ("grayscale", 0.5),
307
+ ("low", 0.3),
308
+ ("high", 0.2),
309
+ ]
310
+
311
+ # Filter to available
312
+ available = [(m, w) for (m, w) in weighted_order if m in mode_to_path]
313
+ chosen_path = None
314
+ if available:
315
+ rnd = random.random()
316
+ cum = 0.0
317
+ total_w = sum(w for _, w in available)
318
+ for m, w in available:
319
+ cum += w / total_w
320
+ if rnd <= cum:
321
+ chosen_path = mode_to_path.get(m)
322
+ break
323
+ if chosen_path is None:
324
+ chosen_path = mode_to_path.get(available[-1][0])
325
+ else:
326
+ # Fallback to any provided path
327
+ if mode_to_path:
328
+ chosen_path = next(iter(mode_to_path.values()))
329
+
330
+ # Open chosen image
331
+ if chosen_path is not None:
332
+ try:
333
+ # Paths are relative to base_dir
334
+ open_path = os.path.join(self.base_dir, chosen_path)
335
+ img = Image.open(open_path).convert("RGB").resize((width, height), resample=Image.BILINEAR)
336
+ return img
337
+ except Exception:
338
+ pass
339
+
340
+ # Fallback: return target image
341
+ return Image.open(t_p).convert("RGB").resize((width, height), resample=Image.BILINEAR)
342
+
343
+ # Choose base image with probabilities:
344
+ # 20%: original target, 20%: color augment(target), 60%: relight augment
345
+ rsel = random.random()
346
+ if rsel < 0.2:
347
+ base_img = tgt_img
348
+ elif rsel < 0.4:
349
+ base_img = _color_augment(tgt_img)
350
+ else:
351
+ base_img = _choose_relight_image(rec, fw, fh)
352
+ base_np = np.array(base_img)
353
+ fore_np = base_np.copy()
354
+
355
+ # Random perspective augmentation (50%): apply to foreground ROI (mask bbox) and its mask only
356
+ perspective_applied = False
357
+ roi_update = None
358
+ paste_mask_bool = mask_bin.astype(bool)
359
+ if random.random() < 0.5:
360
+ try:
361
+ import cv2
362
+ ys, xs = np.where(mask_bin > 0)
363
+ if len(xs) > 0 and len(ys) > 0:
364
+ x1, x2 = int(xs.min()), int(xs.max())
365
+ y1, y2 = int(ys.min()), int(ys.max())
366
+ if x2 > x1 and y2 > y1:
367
+ roi = base_np[y1:y2 + 1, x1:x2 + 1]
368
+ roi_mask = mask_bin[y1:y2 + 1, x1:x2 + 1]
369
+ bh, bw = roi.shape[:2]
370
+ # Random perturbation relative to ROI size
371
+ max_ratio = random.uniform(0.1, 0.3)
372
+ dx = bw * max_ratio
373
+ dy = bh * max_ratio
374
+ src = np.float32([[0, 0], [bw - 1, 0], [bw - 1, bh - 1], [0, bh - 1]])
375
+ dst = np.float32([
376
+ [np.clip(random.uniform(-dx, dx), 0, bw - 1), np.clip(random.uniform(-dy, dy), 0, bh - 1)],
377
+ [np.clip(bw - 1 + random.uniform(-dx, dx), 0, bw - 1), np.clip(random.uniform(-dy, dy), 0, bh - 1)],
378
+ [np.clip(bw - 1 + random.uniform(-dx, dx), 0, bw - 1), np.clip(bh - 1 + random.uniform(-dy, dy), 0, bh - 1)],
379
+ [np.clip(random.uniform(-dx, dx), 0, bw - 1), np.clip(bh - 1 + random.uniform(-dy, dy), 0, bh - 1)],
380
+ ])
381
+ M = cv2.getPerspectiveTransform(src, dst)
382
+ warped_roi = cv2.warpPerspective(roi, M, (bw, bh), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT101)
383
+ warped_mask_roi = cv2.warpPerspective((roi_mask.astype(np.uint8) * 255), M, (bw, bh), flags=cv2.INTER_NEAREST, borderMode=cv2.BORDER_CONSTANT, borderValue=0) > 127
384
+ # Build a fresh foreground canvas
385
+ fore_np = np.zeros_like(base_np)
386
+ h_warp, w_warp = warped_mask_roi.shape
387
+ y2c = y1 + h_warp
388
+ x2c = x1 + w_warp
389
+ fore_np[y1:y2c, x1:x2c][warped_mask_roi] = warped_roi[warped_mask_roi]
390
+ paste_mask_bool = np.zeros_like(mask_bin, dtype=bool)
391
+ paste_mask_bool[y1:y2c, x1:x2c] = warped_mask_roi
392
+ roi_update = (x1, y1, h_warp, w_warp, warped_mask_roi)
393
+ perspective_applied = True
394
+ except Exception:
395
+ perspective_applied = False
396
+ paste_mask_bool = mask_bin.astype(bool)
397
+ fore_np = base_np
398
+
399
+ # Optional: simulate resolution artifacts
400
+ if random.random() < 0.7:
401
+ ys, xs = np.where(paste_mask_bool)
402
+ if len(xs) > 0 and len(ys) > 0:
403
+ x1, x2 = int(xs.min()), int(xs.max())
404
+ y1, y2 = int(ys.min()), int(ys.max())
405
+ if x2 > x1 and y2 > y1:
406
+ crop = fore_np[y1:y2 + 1, x1:x2 + 1]
407
+ ch, cw = crop.shape[:2]
408
+ scale = random.uniform(0.15, 0.9)
409
+ dw = max(1, int(cw * scale))
410
+ dh = max(1, int(ch * scale))
411
+ try:
412
+ small = Image.fromarray(crop.astype(np.uint8)).resize((dw, dh), Image.BICUBIC)
413
+ back = small.resize((cw, ch), Image.BICUBIC)
414
+ crop_blurred = np.array(back).astype(np.uint8)
415
+ fore_np[y1:y2 + 1, x1:x2 + 1] = crop_blurred
416
+ except Exception:
417
+ pass
418
+
419
+ # Build masked target and compose
420
+ union_mask_for_target = union_mask.copy()
421
+ if roi_update is not None:
422
+ rx, ry, rh, rw, warped_mask_roi = roi_update
423
+ um_roi = union_mask_for_target[ry:ry + rh, rx:rx + rw]
424
+ union_mask_for_target[ry:ry + rh, rx:rx + rw] = np.clip(um_roi | warped_mask_roi.astype(np.uint8), 0, 1)
425
+ masked_t_np = tgt_np.copy()
426
+ masked_t_np[union_mask_for_target.astype(bool)] = 255
427
+ composed_np = masked_t_np.copy()
428
+ m_fore = paste_mask_bool
429
+ composed_np[m_fore] = fore_np[m_fore]
430
+
431
+ # Build tensors
432
+ source_tensor = to_tensor_and_norm(Image.fromarray(composed_np.astype(np.uint8)))
433
+ mask_tensor = torch.from_numpy(union_mask.astype(np.float32)).unsqueeze(0)
434
+
435
+ # Caption: prepend instruction
436
+ cap_orig = rec.get("final_prompt", "") or ""
437
+ # Handle list format in final_prompt
438
+ if isinstance(cap_orig, list) and len(cap_orig) > 0:
439
+ cap_orig = cap_orig[0] if isinstance(cap_orig[0], str) else str(cap_orig[0])
440
+ cap = _prepend_caption(cap_orig)
441
+ if perspective_applied:
442
+ cap = f"{cap} Fix the perspective if necessary."
443
+ ids1, ids2 = _tokenize(tokenizers, cap)
444
+
445
+ return {
446
+ "source_pixel_values": source_tensor,
447
+ "pixel_values": target_tensor,
448
+ "token_ids_clip": ids1,
449
+ "token_ids_t5": ids2,
450
+ "mask_values": mask_tensor,
451
+ }
452
+
453
+ return PlacementDataset(records, base_dir)
454
+
455
+
456
+ def make_interactive_dataset_subjects(args, tokenizers, accelerator=None, base_dir=None):
457
+ """
458
+ Dataset for JSONL with fields:
459
+ - input_path: relative to base_dir (target image)
460
+ - output_path: absolute path to image with foreground
461
+ - mask_after_completion: absolute path to mask
462
+ - img_width, img_height: resize dimensions
463
+ - prompt: caption
464
+
465
+ source image construction:
466
+ - background is target_image with a hole punched by grown `mask_after_completion`
467
+ - foreground is from `output_path` image, pasted using original `mask_after_completion`
468
+ - 50% chance to color augment the foreground source
469
+ - NO perspective transform (moved to placement dataset)
470
+
471
+ Args:
472
+ base_dir: Base directory for resolving relative paths. If None, uses args.interactive_base_dir.
473
+ """
474
+ if base_dir is None:
475
+ base_dir = getattr(args, "interactive_base_dir")
476
+
477
+ data_files = _resolve_jsonl(getattr(args, "train_data_jsonl", None))
478
+ file_paths = data_files.get("train", [])
479
+ records = []
480
+ for p in file_paths:
481
+ with open(p, "r", encoding="utf-8") as f:
482
+ for line in f:
483
+ line = line.strip()
484
+ if not line:
485
+ continue
486
+ try:
487
+ obj = json.loads(line)
488
+ except Exception:
489
+ # Best-effort: strip any trailing commas and retry
490
+ try:
491
+ obj = json.loads(line.rstrip(","))
492
+ except Exception:
493
+ continue
494
+ # Keep only fields we actually need to avoid schema issues
495
+ pruned = {
496
+ "input_path": obj.get("input_path"),
497
+ "output_path": obj.get("output_path"),
498
+ "mask_after_completion": obj.get("mask_after_completion"),
499
+ "img_width": obj.get("img_width"),
500
+ "img_height": obj.get("img_height"),
501
+ "prompt": obj.get("prompt"),
502
+ # New optional fields
503
+ "generated_images": obj.get("generated_images"),
504
+ "positive_prompt_used": obj.get("positive_prompt_used"),
505
+ "negative_caption_used": obj.get("negative_caption_used"),
506
+ }
507
+ records.append(pruned)
508
+
509
+ size = int(getattr(args, "cond_size", 512))
510
+
511
+ to_tensor_and_norm = transforms.Compose([
512
+ transforms.ToTensor(),
513
+ transforms.Normalize([0.5], [0.5]),
514
+ ])
515
+
516
+ class SubjectsDataset(torch.utils.data.Dataset):
517
+ def __init__(self, hf_ds, base_dir):
518
+ self.ds = hf_ds
519
+ self.base_dir = base_dir
520
+ def __len__(self):
521
+ # One sample per record
522
+ return len(self.ds)
523
+ def __getitem__(self, idx):
524
+ rec = self.ds[idx % len(self.ds)]
525
+
526
+ t_rel = rec.get("input_path", "")
527
+ foreground_p = rec.get("output_path", "")
528
+ m_abs = rec.get("mask_after_completion", "")
529
+
530
+ if not os.path.isabs(m_abs):
531
+ raise ValueError("mask_after_completion must be absolute")
532
+ if not os.path.isabs(foreground_p):
533
+ raise ValueError("output_path must be absolute")
534
+
535
+ t_p = os.path.join(self.base_dir, t_rel)
536
+ m_p = m_abs
537
+
538
+ import cv2
539
+ mask_loaded = cv2.imread(m_p, cv2.IMREAD_GRAYSCALE)
540
+ if mask_loaded is None:
541
+ raise ValueError(f"Failed to read mask: {m_p}")
542
+
543
+ tgt_img = Image.open(t_p).convert("RGB")
544
+ foreground_source_img = Image.open(foreground_p).convert("RGB")
545
+
546
+ fw = int(rec.get("img_width", tgt_img.width))
547
+ fh = int(rec.get("img_height", tgt_img.height))
548
+ tgt_img = tgt_img.resize((fw, fh), resample=Image.BILINEAR)
549
+ foreground_source_img = foreground_source_img.resize((fw, fh), resample=Image.BILINEAR)
550
+ mask_img = Image.fromarray(mask_loaded.astype(np.uint8)).convert("L").resize((fw, fh), Image.NEAREST)
551
+
552
+ # Convert the resized target image to a normalized tensor
553
+ target_tensor = to_tensor_and_norm(tgt_img)
554
+
555
+ # Binary mask at final_size
556
+ mask_np = np.array(mask_img)
557
+ mask_bin = (mask_np > 127).astype(np.uint8)
558
+
559
+ # 1) Grow mask_after_completion by a random 50-200 pixels
560
+ grown_mask = _dilate_mask(mask_bin, 50, 200)
561
+
562
+ # 2) Optional random augmentation mask constrained by m_p
563
+ rand_mask = _constrained_random_mask(mask_bin, fh, fw, aug_prob=0.7)
564
+
565
+ # 3) Final union mask
566
+ union_mask = np.clip(grown_mask | rand_mask, 0, 1).astype(np.uint8)
567
+ tgt_np = np.array(tgt_img)
568
+
569
+ # Helper: choose relighted image from generated_images with weights
570
+ def _choose_relight_image(rec, default_img, width, height):
571
+ gen_list = rec.get("generated_images") or []
572
+ # Build map mode -> path
573
+ mode_to_path = {}
574
+ for it in gen_list:
575
+ try:
576
+ mode = str(it.get("mode", "")).strip().lower()
577
+ path = it.get("path")
578
+ except Exception:
579
+ continue
580
+ if not mode or not path:
581
+ continue
582
+ mode_to_path[mode] = path
583
+
584
+ # Weighted selection among available modes
585
+ weighted_order = [
586
+ ("grayscale", 0.5),
587
+ ("low", 0.3),
588
+ ("high", 0.2),
589
+ ]
590
+
591
+ # Filter to available
592
+ available = [(m, w) for (m, w) in weighted_order if m in mode_to_path]
593
+ chosen_path = None
594
+ if available:
595
+ rnd = random.random()
596
+ cum = 0.0
597
+ total_w = sum(w for _, w in available)
598
+ for m, w in available:
599
+ cum += w / total_w
600
+ if rnd <= cum:
601
+ chosen_path = mode_to_path.get(m)
602
+ break
603
+ if chosen_path is None:
604
+ chosen_path = mode_to_path.get(available[-1][0])
605
+ else:
606
+ # Fallback to any provided path
607
+ if mode_to_path:
608
+ chosen_path = next(iter(mode_to_path.values()))
609
+
610
+ # Open chosen image
611
+ if chosen_path is not None:
612
+ try:
613
+ open_path = chosen_path
614
+ # generated paths are typically absolute; if not, use as-is
615
+ img = Image.open(open_path).convert("RGB").resize((width, height), resample=Image.BILINEAR)
616
+ return img
617
+ except Exception:
618
+ pass
619
+
620
+ return default_img
621
+
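The cumulative-weight loop in `_choose_relight_image` is a weighted draw over whichever relight modes happen to exist for a record. A rough, self-contained restatement (illustrative only; the mode availability below is made up):

```python
import random

# Weighted draw over available relight modes, mirroring _choose_relight_image:
# grayscale=0.5, low=0.3, high=0.2, renormalized over what is actually present.
mode_to_path = {"grayscale": "gray.png", "low": "low.png"}   # hypothetical availability
weighted_order = [("grayscale", 0.5), ("low", 0.3), ("high", 0.2)]
available = [(m, w) for m, w in weighted_order if m in mode_to_path]
if available:
    modes, weights = zip(*available)
    chosen_path = mode_to_path[random.choices(modes, weights=weights, k=1)[0]]
```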
622
+ # 5) Choose base image with probabilities:
623
+ # 20%: original, 20%: color augment(original), 60%: relight augment
624
+ rsel = random.random()
625
+ if rsel < 0.2:
626
+ base_img = foreground_source_img
627
+ elif rsel < 0.4:
628
+ base_img = _color_augment(foreground_source_img)
629
+ else:
630
+ base_img = _choose_relight_image(rec, foreground_source_img, fw, fh)
631
+ base_np = np.array(base_img)
632
+
633
+ # 5.1) Random perspective augmentation (50%): apply to foreground ROI (mask bbox) and its mask only
634
+ perspective_applied = False
635
+ roi_update = None
636
+ paste_mask_bool = mask_bin.astype(bool)
637
+ if random.random() < 0.5:
638
+ try:
639
+ import cv2
640
+ ys, xs = np.where(mask_bin > 0)
641
+ if len(xs) > 0 and len(ys) > 0:
642
+ x1, x2 = int(xs.min()), int(xs.max())
643
+ y1, y2 = int(ys.min()), int(ys.max())
644
+ if x2 > x1 and y2 > y1:
645
+ roi = base_np[y1:y2 + 1, x1:x2 + 1]
646
+ roi_mask = mask_bin[y1:y2 + 1, x1:x2 + 1]
647
+ bh, bw = roi.shape[:2]
648
+ # Random perturbation relative to ROI size
649
+ max_ratio = random.uniform(0.1, 0.3)
650
+ dx = bw * max_ratio
651
+ dy = bh * max_ratio
652
+ src = np.float32([[0, 0], [bw - 1, 0], [bw - 1, bh - 1], [0, bh - 1]])
653
+ dst = np.float32([
654
+ [np.clip(random.uniform(-dx, dx), 0, bw - 1), np.clip(random.uniform(-dy, dy), 0, bh - 1)],
655
+ [np.clip(bw - 1 + random.uniform(-dx, dx), 0, bw - 1), np.clip(random.uniform(-dy, dy), 0, bh - 1)],
656
+ [np.clip(bw - 1 + random.uniform(-dx, dx), 0, bw - 1), np.clip(bh - 1 + random.uniform(-dy, dy), 0, bh - 1)],
657
+ [np.clip(random.uniform(-dx, dx), 0, bw - 1), np.clip(bh - 1 + random.uniform(-dy, dy), 0, bh - 1)],
658
+ ])
659
+ M = cv2.getPerspectiveTransform(src, dst)
660
+ warped_roi = cv2.warpPerspective(roi, M, (bw, bh), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT101)
661
+ warped_mask_roi = cv2.warpPerspective((roi_mask.astype(np.uint8) * 255), M, (bw, bh), flags=cv2.INTER_NEAREST, borderMode=cv2.BORDER_CONSTANT, borderValue=0) > 127
662
+ # Build a fresh foreground canvas
663
+ fore_np = np.zeros_like(base_np)
664
+ h_warp, w_warp = warped_mask_roi.shape
665
+ y2c = y1 + h_warp
666
+ x2c = x1 + w_warp
667
+ fore_np[y1:y2c, x1:x2c][warped_mask_roi] = warped_roi[warped_mask_roi]
668
+ paste_mask_bool = np.zeros_like(mask_bin, dtype=bool)
669
+ paste_mask_bool[y1:y2c, x1:x2c] = warped_mask_roi
670
+ roi_update = (x1, y1, h_warp, w_warp, warped_mask_roi)
671
+ perspective_applied = True
672
+ base_np = fore_np
673
+ except Exception:
674
+ perspective_applied = False
675
+ paste_mask_bool = mask_bin.astype(bool)
676
+
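A minimal standalone sketch of the ROI perspective jitter used above, on a dummy crop; it assumes OpenCV and NumPy are available and keeps the same interpolation/border choices:

```python
import random
import numpy as np
import cv2

# Jitter the four corners of an 80x100 crop by up to 10-30% of its size and warp it,
# as done for the foreground ROI above (INTER_LINEAR for pixels, NEAREST for masks).
roi = np.random.randint(0, 255, (80, 100, 3), dtype=np.uint8)
bh, bw = roi.shape[:2]
ratio = random.uniform(0.1, 0.3)
dx, dy = bw * ratio, bh * ratio
src = np.float32([[0, 0], [bw - 1, 0], [bw - 1, bh - 1], [0, bh - 1]])
dst = np.float32([[np.clip(x + random.uniform(-dx, dx), 0, bw - 1),
                   np.clip(y + random.uniform(-dy, dy), 0, bh - 1)] for x, y in src])
M = cv2.getPerspectiveTransform(src, dst)
warped = cv2.warpPerspective(roi, M, (bw, bh), flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REFLECT101)
```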
677
+ # Optional: simulate cut-out foregrounds coming from different resolutions by
678
+ # downscaling the masked foreground region and upscaling back to original size.
679
+ # This introduces realistic blur/aliasing seen in real inpaint workflows.
680
+ if random.random() < 0.7:
681
+ ys, xs = np.where(mask_bin > 0)
682
+ if len(xs) > 0 and len(ys) > 0:
683
+ x1, x2 = int(xs.min()), int(xs.max())
684
+ y1, y2 = int(ys.min()), int(ys.max())
685
+ # Ensure valid box
686
+ if x2 > x1 and y2 > y1:
687
+ crop = base_np[y1:y2 + 1, x1:x2 + 1]
688
+ ch, cw = crop.shape[:2]
689
+ scale = random.uniform(0.2, 0.9)
690
+ dw = max(1, int(cw * scale))
691
+ dh = max(1, int(ch * scale))
692
+ try:
693
+ small = Image.fromarray(crop.astype(np.uint8)).resize((dw, dh), Image.BICUBIC)
694
+ back = small.resize((cw, ch), Image.BICUBIC)
695
+ crop_blurred = np.array(back).astype(np.uint8)
696
+ base_np[y1:y2 + 1, x1:x2 + 1] = crop_blurred
697
+ except Exception:
698
+ # Fallback: skip if resize fails
699
+ pass
700
+
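The down/up-scale trick above in isolation (PIL only; crop size and scale range are arbitrary):

```python
import random
import numpy as np
from PIL import Image

# Down-scale a crop and resize it back to simulate a foreground that came from a
# lower-resolution source, which is what the masked-region branch above does.
crop = np.random.randint(0, 255, (120, 160, 3), dtype=np.uint8)
scale = random.uniform(0.2, 0.9)
dw, dh = max(1, int(160 * scale)), max(1, int(120 * scale))
small = Image.fromarray(crop).resize((dw, dh), Image.BICUBIC)
degraded = np.array(small.resize((160, 120), Image.BICUBIC))   # same shape, softer detail
```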
701
+ # 6) Build masked target using (possibly) updated union mask; then paste
702
+ union_mask_for_target = union_mask.copy()
703
+ if roi_update is not None:
704
+ rx, ry, rh, rw, warped_mask_roi = roi_update
705
+ # Ensure union covers the warped foreground area inside ROI using warped shape
706
+ um_roi = union_mask_for_target[ry:ry + rh, rx:rx + rw]
707
+ union_mask_for_target[ry:ry + rh, rx:rx + rw] = np.clip(um_roi | warped_mask_roi.astype(np.uint8), 0, 1)
708
+ masked_t_np = tgt_np.copy()
709
+ masked_t_np[union_mask_for_target.astype(bool)] = 255
710
+ composed_np = masked_t_np.copy()
711
+ m_fore = paste_mask_bool
712
+ composed_np[m_fore] = base_np[m_fore]
713
+
714
+ # 7) Build tensors
715
+ source_tensor = to_tensor_and_norm(Image.fromarray(composed_np.astype(np.uint8)))
716
+ mask_tensor = torch.from_numpy(union_mask.astype(np.float32)).unsqueeze(0)
717
+
718
+ # 8) Caption: prepend instruction, 20% keep only instruction
719
+ cap_orig = rec.get("prompt", "") or ""
720
+ cap = _prepend_caption(cap_orig)
721
+ if perspective_applied:
722
+ cap = f"{cap} Fix the perspective if necessary."
723
+ ids1, ids2 = _tokenize(tokenizers, cap)
724
+
725
+ return {
726
+ "source_pixel_values": source_tensor,
727
+ "pixel_values": target_tensor,
728
+ "token_ids_clip": ids1,
729
+ "token_ids_t5": ids2,
730
+ "mask_values": mask_tensor,
731
+ }
732
+
733
+ return SubjectsDataset(records, base_dir)
734
+
735
+
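A hypothetical example of one JSONL record consumed by `make_interactive_dataset_subjects`, plus the assumed wiring; all paths, sizes, and the prompt below are illustrative, not taken from the real dataset:

```python
example_record = {
    "input_path": "images/000123.jpg",                      # relative to interactive_base_dir (target)
    "output_path": "/abs/path/foregrounds/000123.png",      # absolute, image containing the foreground
    "mask_after_completion": "/abs/path/masks/000123.png",  # absolute, binary foreground mask
    "img_width": 1024,
    "img_height": 768,
    "prompt": "A person holding a red umbrella.",
    "generated_images": [
        {"mode": "grayscale", "path": "/abs/path/relight/000123_gray.png"},
        {"mode": "low", "path": "/abs/path/relight/000123_low.png"},
    ],
}

# Assumed usage, with `args` carrying train_data_jsonl / interactive_base_dir / cond_size
# and `tokenizers` being the (CLIP, T5) pair used elsewhere in this repo:
# dataset = make_interactive_dataset_subjects(args, tokenizers)
# sample = dataset[0]   # source_pixel_values, pixel_values, token_ids_clip/t5, mask_values
```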
736
+ def make_pexels_dataset_subjects(args, tokenizers, accelerator=None, base_dir=None):
737
+ """
738
+ Dataset for JSONL with fields:
739
+ - input_path: relative to base_dir (target image)
740
+ - output_path: relative to relight_base_dir (relighted image)
741
+ - final_size: {width, height} resize applied
742
+ - caption: text caption
743
+
744
+ Modified to use segmentation maps instead of raw_mask_path.
745
+ Randomly selects 2-5 segments from segmentation map, applies augmentation to each, and takes union.
746
+ This simulates multiple foreground objects being placed like a puzzle.
747
+
748
+ Each segment independently uses: 20% original, 80% color-augmented (the relighted variant is currently disabled in code). An example record is sketched after this function.
749
+
750
+ Args:
751
+ base_dir: Base directory for resolving relative paths. If None, uses args.pexels_base_dir.
752
+ """
753
+ if base_dir is None:
754
+ base_dir = getattr(args, "pexels_base_dir", "/mnt/robby-b1/common/datasets")
755
+
756
+ relight_base_dir = getattr(args, "pexels_relight_base_dir", "/robby/share/Editing/lzc/data/relight_outputs")
757
+ seg_base_dir = getattr(args, "seg_base_dir", "/mnt/robby-b1/common/datasets/pexels-mask/20190515093182")
758
+
759
+ data_files = _resolve_jsonl(getattr(args, "pexels_data_jsonl", None))
760
+ file_paths = data_files.get("train", [])
761
+ records = []
762
+ for p in file_paths:
763
+ with open(p, "r", encoding="utf-8") as f:
764
+ for line in f:
765
+ line = line.strip()
766
+ if not line:
767
+ continue
768
+ try:
769
+ obj = json.loads(line)
770
+ except Exception:
771
+ try:
772
+ obj = json.loads(line.rstrip(","))
773
+ except Exception:
774
+ continue
775
+ pruned = {
776
+ "input_path": obj.get("input_path"),
777
+ "output_path": obj.get("output_path"),
778
+ "final_size": obj.get("final_size"),
779
+ "caption": obj.get("caption"),
780
+ }
781
+ records.append(pruned)
782
+
783
+ to_tensor_and_norm = transforms.Compose([
784
+ transforms.ToTensor(),
785
+ transforms.Normalize([0.5], [0.5]),
786
+ ])
787
+
788
+ class PexelsDataset(torch.utils.data.Dataset):
789
+ def __init__(self, hf_ds, base_dir, relight_base_dir, seg_base_dir):
790
+ self.ds = hf_ds
791
+ self.base_dir = base_dir
792
+ self.relight_base_dir = relight_base_dir
793
+ self.seg_base_dir = seg_base_dir
794
+
795
+ def __len__(self):
796
+ return len(self.ds)
797
+
798
+ def _extract_hash_from_filename(self, filename: str) -> str:
799
+ """Extract hash from input filename for segmentation map lookup."""
800
+ stem = os.path.splitext(os.path.basename(filename))[0]
801
+ if '_' in stem:
802
+ parts = stem.split('_')
803
+ return parts[-1]
804
+ return stem
805
+
806
+ def _build_segmap_path(self, input_filename: str) -> str:
807
+ """Build path to segmentation map from input filename."""
808
+ hash_id = self._extract_hash_from_filename(input_filename)
809
+ return os.path.join(self.seg_base_dir, f"{hash_id}_map.png")
810
+
811
+ def _load_segmap_uint32(self, seg_path: str):
812
+ """Load segmentation map as uint32 array."""
813
+ import cv2
814
+ try:
815
+ with Image.open(seg_path) as im:
816
+ if im.mode == 'P':
817
+ seg = np.array(im)
818
+ elif im.mode in ('I;16', 'I', 'L'):
819
+ seg = np.array(im)
820
+ else:
821
+ seg = np.array(im.convert('L'))
822
+ except Exception:
823
+ return None
824
+
825
+ if seg.ndim == 3:
826
+ seg = cv2.cvtColor(seg, cv2.COLOR_BGR2GRAY)
827
+ return seg.astype(np.uint32)
828
+
829
+ def _extract_multiple_segments(
830
+ self,
831
+ image_h: int,
832
+ image_w: int,
833
+ seg_path: str,
834
+ min_area_ratio: float = 0.02,
835
+ max_area_ratio: float = 0.4,
836
+ ):
837
+ """Extract 2-5 individual segment masks from segmentation map."""
838
+ import cv2
839
+ seg = self._load_segmap_uint32(seg_path)
840
+ if seg is None:
841
+ return []
842
+
843
+ if seg.shape != (image_h, image_w):
844
+ seg = cv2.resize(seg.astype(np.uint16), (image_w, image_h), interpolation=cv2.INTER_NEAREST).astype(np.uint32)
845
+
846
+ labels, counts = np.unique(seg, return_counts=True)
847
+ if labels.size == 0:
848
+ return []
849
+
850
+ # Exclude background label 0
851
+ bg_mask = labels == 0
852
+ labels = labels[~bg_mask]
853
+ counts = counts[~bg_mask]
854
+ if labels.size == 0:
855
+ return []
856
+
857
+ area = image_h * image_w
858
+ min_px = int(round(min_area_ratio * area))
859
+ max_px = int(round(max_area_ratio * area))
860
+ keep = (counts >= min_px) & (counts <= max_px)
861
+ cand_labels = labels[keep]
862
+ if cand_labels.size == 0:
863
+ return []
864
+
865
+ # Randomly select 2-5 labels (fewer if not enough candidates survive the area filter)
866
+ max_sel = min(5, cand_labels.size)
867
+ min_sel = min(2, cand_labels.size)
868
+ num_to_select = random.randint(min_sel, max_sel)
869
+ chosen = np.random.choice(cand_labels, size=num_to_select, replace=False)
870
+
871
+ # Create individual masks for each chosen label
872
+ individual_masks = []
873
+ for lab in chosen:
874
+ binm = (seg == int(lab)).astype(np.uint8)
875
+ # Apply opening operation to clean up mask
876
+ k = max(3, int(round(max(image_h, image_w) * 0.01)))
877
+ if k % 2 == 0:
878
+ k += 1
879
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (k, k))
880
+ eroded = cv2.erode(binm, kernel, iterations=1)
881
+ opened = cv2.dilate(eroded, kernel, iterations=1)
882
+ individual_masks.append(opened)
883
+
884
+ return individual_masks
885
+
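The area filter in `_extract_multiple_segments` sketched on a toy segmentation map (labels and sizes are made up):

```python
import numpy as np

# 16x16 toy map: label 0 is background, label 1 covers 25% of the image and survives
# the 2%-40% area filter, label 2 is a single pixel (~0.4%) and is dropped.
seg = np.zeros((16, 16), dtype=np.uint32)
seg[:8, :8] = 1
seg[15, 15] = 2
labels, counts = np.unique(seg, return_counts=True)
fg = labels != 0
labels, counts = labels[fg], counts[fg]
area = seg.size
keep = (counts >= int(round(0.02 * area))) & (counts <= int(round(0.4 * area)))
candidate_labels = labels[keep]   # -> array([1], dtype=uint32)
```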
886
+ def __getitem__(self, idx):
887
+ rec = self.ds[idx % len(self.ds)]
888
+
889
+ t_rel = rec.get("input_path", "")
890
+ r_rel = rec.get("output_path", "")
891
+
892
+ t_p = os.path.join(self.base_dir, t_rel)
893
+ relight_p = os.path.join(self.relight_base_dir, r_rel)
894
+
895
+ import cv2
896
+ tgt_img = Image.open(t_p).convert("RGB")
897
+
898
+ # Load relighted image, fallback to target if not available
899
+ try:
900
+ relighted_img = Image.open(relight_p).convert("RGB")
901
+ except Exception:
902
+ relighted_img = tgt_img.copy()
903
+
904
+ final_size = rec.get("final_size", {}) or {}
905
+ fw = int(final_size.get("width", tgt_img.width))
906
+ fh = int(final_size.get("height", tgt_img.height))
907
+ tgt_img = tgt_img.resize((fw, fh), resample=Image.BILINEAR)
908
+ relighted_img = relighted_img.resize((fw, fh), resample=Image.BILINEAR)
909
+
910
+ target_tensor = to_tensor_and_norm(tgt_img)
911
+
912
+ # Build segmentation map path and extract multiple segments
913
+ input_filename = os.path.basename(t_rel)
914
+ seg_path = self._build_segmap_path(input_filename)
915
+ individual_masks = self._extract_multiple_segments(fh, fw, seg_path)
916
+
917
+ if not individual_masks:
918
+ # Fallback: create empty mask (will be handled gracefully)
919
+ union_mask = np.zeros((fh, fw), dtype=np.uint8)
920
+ individual_masks = []
921
+ else:
922
+ # Apply augmentation to each segment mask and take union
923
+ augmented_masks = []
924
+ for seg_mask in individual_masks:
925
+ # 1) Grow mask by random 50-200 pixels
926
+ grown = _dilate_mask(seg_mask, 50, 200)
927
+ # 2) Optional random augmentation mask constrained by this segment
928
+ rand_mask = _constrained_random_mask(seg_mask, fh, fw, aug_prob=0.7)
929
+ # 3) Union for this segment
930
+ seg_union = np.clip(grown | rand_mask, 0, 1).astype(np.uint8)
931
+ augmented_masks.append(seg_union)
932
+
933
+ # Take union of all augmented masks
934
+ union_mask = np.zeros((fh, fw), dtype=np.uint8)
935
+ for m in augmented_masks:
936
+ union_mask = np.clip(union_mask | m, 0, 1).astype(np.uint8)
937
+
938
+ tgt_np = np.array(tgt_img)
939
+
940
+ # Build masked target first
941
+ masked_t_np = tgt_np.copy()
942
+ masked_t_np[union_mask.astype(bool)] = 255
943
+ composed_np = masked_t_np.copy()
944
+
945
+ # Process each segment independently with different augmentations
946
+ # This simulates multiple foreground objects from different sources
947
+ for seg_mask in individual_masks:
948
+ # 1) Choose source for this segment: 20% original, 80% color-augmented (relight branch disabled below)
949
+ r = random.random()
950
+ if r < 0.2:
951
+ # Original image
952
+ seg_source_img = tgt_img
953
+ else:
954
+ seg_source_img = _color_augment(tgt_img)
955
+ # elif r < 0.4:
956
+ # # Color augmentation
957
+ # seg_source_img = _color_augment(tgt_img)
958
+ # else:
959
+ # # Relighted image
960
+ # seg_source_img = relighted_img
961
+
962
+ seg_source_np = np.array(seg_source_img)
963
+
964
+ # 2) Apply resolution augmentation to this segment's region
965
+ if random.random() < 0.7:
966
+ ys, xs = np.where(seg_mask > 0)
967
+ if len(xs) > 0 and len(ys) > 0:
968
+ x1, x2 = int(xs.min()), int(xs.max())
969
+ y1, y2 = int(ys.min()), int(ys.max())
970
+ if x2 > x1 and y2 > y1:
971
+ crop = seg_source_np[y1:y2 + 1, x1:x2 + 1]
972
+ ch, cw = crop.shape[:2]
973
+ scale = random.uniform(0.2, 0.9)
974
+ dw = max(1, int(cw * scale))
975
+ dh = max(1, int(ch * scale))
976
+ try:
977
+ small = Image.fromarray(crop.astype(np.uint8)).resize((dw, dh), Image.BICUBIC)
978
+ back = small.resize((cw, ch), Image.BICUBIC)
979
+ crop_blurred = np.array(back).astype(np.uint8)
980
+ seg_source_np[y1:y2 + 1, x1:x2 + 1] = crop_blurred
981
+ except Exception:
982
+ pass
983
+
984
+ # 3) Paste this segment onto composed image
985
+ m_fore = seg_mask.astype(bool)
986
+ composed_np[m_fore] = seg_source_np[m_fore]
987
+
988
+ # Build tensors
989
+ source_tensor = to_tensor_and_norm(Image.fromarray(composed_np.astype(np.uint8)))
990
+ mask_tensor = torch.from_numpy(union_mask.astype(np.float32)).unsqueeze(0)
991
+
992
+ # Caption: prepend instruction
993
+ cap_orig = rec.get("caption", "") or ""
994
+ cap = _prepend_caption(cap_orig)
995
+ ids1, ids2 = _tokenize(tokenizers, cap)
996
+
997
+ return {
998
+ "source_pixel_values": source_tensor,
999
+ "pixel_values": target_tensor,
1000
+ "token_ids_clip": ids1,
1001
+ "token_ids_t5": ids2,
1002
+ "mask_values": mask_tensor,
1003
+ }
1004
+
1005
+ return PexelsDataset(records, base_dir, relight_base_dir, seg_base_dir)
1006
+
1007
+
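A hypothetical pexels record for `make_pexels_dataset_subjects`; the field values are illustrative only:

```python
example_record = {
    "input_path": "pexels/photos/pexels-photo_abc123.jpg",   # relative to pexels_base_dir
    "output_path": "relight/abc123_low.jpg",                  # relative to pexels_relight_base_dir
    "final_size": {"width": 1024, "height": 768},
    "caption": "A wooden table with a cup of coffee by a window.",
}
# The segmentation map is resolved from seg_base_dir as "<hash>_map.png", where the hash is
# the last underscore-separated token of the input filename ("abc123" in this made-up case).
```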
1008
+ def make_mixed_dataset(args, tokenizers, interactive_jsonl_path=None, placement_jsonl_path=None,
1009
+ pexels_jsonl_path=None, interactive_base_dir=None, placement_base_dir=None,
1010
+ pexels_base_dir=None, interactive_weight=1.0, placement_weight=1.0,
1011
+ pexels_weight=1.0, accelerator=None):
1012
+ """
1013
+ Create a mixed dataset combining interactive, placement, and pexels datasets.
1014
+
1015
+ Args:
1016
+ args: Arguments object with dataset configuration
1017
+ tokenizers: Tuple of tokenizers for text encoding
1018
+ interactive_jsonl_path: Path to interactive dataset JSONL (optional)
1019
+ placement_jsonl_path: Path to placement dataset JSONL (optional)
1020
+ pexels_jsonl_path: Path to pexels dataset JSONL (optional)
1021
+ interactive_base_dir: Base directory for interactive dataset paths (optional)
1022
+ placement_base_dir: Base directory for placement dataset paths (optional)
1023
+ pexels_base_dir: Base directory for pexels dataset paths (optional)
1024
+ interactive_weight: Sampling weight for interactive dataset (default: 1.0)
1025
+ placement_weight: Sampling weight for placement dataset (default: 1.0)
1026
+ pexels_weight: Sampling weight for pexels dataset (default: 1.0)
1027
+ accelerator: Optional accelerator object
1028
+
1029
+ Returns:
1030
+ Mixed dataset that samples from all provided datasets with specified weights
1031
+ """
1032
+ datasets = []
1033
+ dataset_names = []
1034
+ dataset_weights = []
1035
+
1036
+ # Create interactive dataset if path provided
1037
+ if interactive_jsonl_path:
1038
+ interactive_args = type('Args', (), {})()
1039
+ for k, v in vars(args).items():
1040
+ setattr(interactive_args, k, v)
1041
+ interactive_args.train_data_jsonl = interactive_jsonl_path
1042
+ if interactive_base_dir:
1043
+ interactive_args.interactive_base_dir = interactive_base_dir
1044
+ interactive_ds = make_interactive_dataset_subjects(interactive_args, tokenizers, accelerator)
1045
+ datasets.append(interactive_ds)
1046
+ dataset_names.append("interactive")
1047
+ dataset_weights.append(interactive_weight)
1048
+
1049
+ # Create placement dataset if path provided
1050
+ if placement_jsonl_path:
1051
+ placement_args = type('Args', (), {})()
1052
+ for k, v in vars(args).items():
1053
+ setattr(placement_args, k, v)
1054
+ placement_args.placement_data_jsonl = placement_jsonl_path
1055
+ if placement_base_dir:
1056
+ placement_args.placement_base_dir = placement_base_dir
1057
+ placement_ds = make_placement_dataset_subjects(placement_args, tokenizers, accelerator)
1058
+ datasets.append(placement_ds)
1059
+ dataset_names.append("placement")
1060
+ dataset_weights.append(placement_weight)
1061
+
1062
+ # Create pexels dataset if path provided
1063
+ if pexels_jsonl_path:
1064
+ pexels_args = type('Args', (), {})()
1065
+ for k, v in vars(args).items():
1066
+ setattr(pexels_args, k, v)
1067
+ pexels_args.pexels_data_jsonl = pexels_jsonl_path
1068
+ if pexels_base_dir:
1069
+ pexels_args.pexels_base_dir = pexels_base_dir
1070
+ pexels_ds = make_pexels_dataset_subjects(pexels_args, tokenizers, accelerator)
1071
+ datasets.append(pexels_ds)
1072
+ dataset_names.append("pexels")
1073
+ dataset_weights.append(pexels_weight)
1074
+
1075
+ if not datasets:
1076
+ raise ValueError("At least one dataset path must be provided")
1077
+
1078
+ if len(datasets) == 1:
1079
+ return datasets[0]
1080
+
1081
+ # Mixed dataset class with balanced sampling (based on smallest dataset)
1082
+ class MixedDataset(torch.utils.data.Dataset):
1083
+ def __init__(self, datasets, dataset_names, dataset_weights):
1084
+ self.datasets = datasets
1085
+ self.dataset_names = dataset_names
1086
+ self.lengths = [len(ds) for ds in datasets]
1087
+
1088
+ # Normalize weights
1089
+ total_weight = sum(dataset_weights)
1090
+ self.weights = [w / total_weight for w in dataset_weights]
1091
+
1092
+ # Calculate samples per dataset based on smallest dataset and weights
1093
+ # Find the minimum weighted size
1094
+ min_weighted_size = min(length / weight for length, weight in zip(self.lengths, dataset_weights) if weight > 0)
1095
+
1096
+ # Each dataset contributes samples proportional to its weight, scaled by min_weighted_size
1097
+ self.samples_per_dataset = [int(min_weighted_size * w) for w in dataset_weights]
1098
+ self.total_length = sum(self.samples_per_dataset)
1099
+
1100
+ # Build cumulative sample counts for indexing
1101
+ self.cumsum_samples = [0]
1102
+ for count in self.samples_per_dataset:
1103
+ self.cumsum_samples.append(self.cumsum_samples[-1] + count)
1104
+
1105
+ print(f"Balanced mixed dataset created:")
1106
+ for i, name in enumerate(dataset_names):
1107
+ print(f" {name}: {self.lengths[i]} total, {self.samples_per_dataset[i]} per epoch")
1108
+ print(f" Total samples per epoch: {self.total_length}")
1109
+
1110
+ def __len__(self):
1111
+ return self.total_length
1112
+
1113
+ def __getitem__(self, idx):
1114
+ # Determine which dataset this idx belongs to
1115
+ dataset_idx = 0
1116
+ for i in range(len(self.cumsum_samples) - 1):
1117
+ if self.cumsum_samples[i] <= idx < self.cumsum_samples[i + 1]:
1118
+ dataset_idx = i
1119
+ break
1120
+
1121
+ # Randomly sample from the selected dataset (enables different samples each epoch)
1122
+ local_idx = random.randint(0, self.lengths[dataset_idx] - 1)
1123
+ sample = self.datasets[dataset_idx][local_idx]
1124
+ # Add dataset source information
1125
+ sample["dataset_source"] = self.dataset_names[dataset_idx]
1126
+ return sample
1127
+
1128
+ return MixedDataset(datasets, dataset_names, dataset_weights)
1129
+
1130
+
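A small worked example (numbers invented) of the per-epoch balancing computed in `MixedDataset.__init__`:

```python
lengths = [10_000, 2_000, 4_000]          # interactive, placement, pexels (hypothetical sizes)
weights = [1.0, 1.0, 2.0]
min_weighted_size = min(l / w for l, w in zip(lengths, weights))      # 2000 / 1.0 = 2000.0
samples_per_dataset = [int(min_weighted_size * w) for w in weights]   # [2000, 2000, 4000]
# The smallest weighted dataset sets the scale; each epoch then draws that many
# (weight-scaled) random samples from every dataset, 8000 samples in total here.
```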
1131
+ def _run_test_mode(
1132
+ interactive_jsonl: str = None,
1133
+ placement_jsonl: str = None,
1134
+ pexels_jsonl: str = None,
1135
+ interactive_base_dir: str = None,
1136
+ placement_base_dir: str = None,
1137
+ pexels_base_dir: str = None,
1138
+ pexels_relight_base_dir: str = None,
1139
+ seg_base_dir: str = None,
1140
+ interactive_weight: float = 1.0,
1141
+ placement_weight: float = 1.0,
1142
+ pexels_weight: float = 1.0,
1143
+ output_dir: str = "test_output",
1144
+ num_samples: int = 100
1145
+ ):
1146
+ """Test dataset by saving samples with source labels.
1147
+
1148
+ Args:
1149
+ interactive_jsonl: Path to interactive dataset JSONL (optional)
1150
+ placement_jsonl: Path to placement dataset JSONL (optional)
1151
+ pexels_jsonl: Path to pexels dataset JSONL (optional)
1152
+ interactive_base_dir: Base directory for interactive dataset
1153
+ placement_base_dir: Base directory for placement dataset
1154
+ pexels_base_dir: Base directory for pexels dataset
1155
+ pexels_relight_base_dir: Base directory for pexels relighted images
1156
+ seg_base_dir: Directory containing segmentation maps for pexels dataset
1157
+ interactive_weight: Sampling weight for interactive dataset (default: 1.0)
1158
+ placement_weight: Sampling weight for placement dataset (default: 1.0)
1159
+ pexels_weight: Sampling weight for pexels dataset (default: 1.0)
1160
+ output_dir: Output directory for test images
1161
+ num_samples: Number of samples to save
1162
+ """
1163
+ if not interactive_jsonl and not placement_jsonl and not pexels_jsonl:
1164
+ raise ValueError("At least one dataset path must be provided")
1165
+
1166
+ os.makedirs(output_dir, exist_ok=True)
1167
+
1168
+ # Create dummy tokenizers for testing
1169
+ class DummyTokenizer:
1170
+ def __call__(self, text, **kwargs):
1171
+ class Result:
1172
+ input_ids = torch.zeros(1, 77, dtype=torch.long)
1173
+ return Result()
1174
+
1175
+ tokenizers = (DummyTokenizer(), DummyTokenizer())
1176
+
1177
+ # Create args object
1178
+ class Args:
1179
+ cond_size = 512
1180
+
1181
+ args = Args()
1182
+ args.train_data_jsonl = interactive_jsonl
1183
+ args.placement_data_jsonl = placement_jsonl
1184
+ args.pexels_data_jsonl = pexels_jsonl
1185
+ args.interactive_base_dir = interactive_base_dir
1186
+ args.placement_base_dir = placement_base_dir
1187
+ args.pexels_base_dir = pexels_base_dir
1188
+ args.pexels_relight_base_dir = pexels_relight_base_dir if pexels_relight_base_dir else "/robby/share/Editing/lzc/data/relight_outputs"
1189
+ args.seg_base_dir = seg_base_dir if seg_base_dir else "/mnt/robby-b1/common/datasets/pexels-mask/20190515093182"
1190
+
1191
+ # Create dataset (single or mixed)
1192
+ try:
1193
+ # Count how many datasets are provided
1194
+ num_datasets = sum([bool(interactive_jsonl), bool(placement_jsonl), bool(pexels_jsonl)])
1195
+
1196
+ if num_datasets > 1:
1197
+ dataset = make_mixed_dataset(
1198
+ args, tokenizers,
1199
+ interactive_jsonl_path=interactive_jsonl,
1200
+ placement_jsonl_path=placement_jsonl,
1201
+ pexels_jsonl_path=pexels_jsonl,
1202
+ interactive_base_dir=args.interactive_base_dir,
1203
+ placement_base_dir=args.placement_base_dir,
1204
+ pexels_base_dir=args.pexels_base_dir,
1205
+ interactive_weight=interactive_weight,
1206
+ placement_weight=placement_weight,
1207
+ pexels_weight=pexels_weight
1208
+ )
1209
+ print(f"Created mixed dataset with {len(dataset)} samples")
1210
+ weights_str = []
1211
+ if interactive_jsonl:
1212
+ weights_str.append(f"Interactive: {interactive_weight:.2f}")
1213
+ if placement_jsonl:
1214
+ weights_str.append(f"Placement: {placement_weight:.2f}")
1215
+ if pexels_jsonl:
1216
+ weights_str.append(f"Pexels: {pexels_weight:.2f}")
1217
+ print(f"Sampling weights - {', '.join(weights_str)}")
1218
+ elif pexels_jsonl:
1219
+ dataset = make_pexels_dataset_subjects(args, tokenizers, base_dir=pexels_base_dir)
1220
+ print(f"Created pexels dataset with {len(dataset)} samples")
1221
+ elif placement_jsonl:
1222
+ dataset = make_placement_dataset_subjects(args, tokenizers, base_dir=args.placement_base_dir)
1223
+ print(f"Created placement dataset with {len(dataset)} samples")
1224
+ else:
1225
+ dataset = make_interactive_dataset_subjects(args, tokenizers, base_dir=args.interactive_base_dir)
1226
+ print(f"Created interactive dataset with {len(dataset)} samples")
1227
+ except Exception as e:
1228
+ print(f"Failed to create dataset: {e}")
1229
+ import traceback
1230
+ traceback.print_exc()
1231
+ return
1232
+
1233
+ # Sample and save
1234
+ saved = 0
1235
+ counts = {}
1236
+
1237
+ for attempt in range(min(num_samples * 3, len(dataset))):
1238
+ try:
1239
+ idx = random.randint(0, len(dataset) - 1)
1240
+ sample = dataset[idx]
1241
+
1242
+ source_name = sample.get("dataset_source", "single")
1243
+ counts[source_name] = counts.get(source_name, 0) + 1
1244
+
1245
+ # Denormalize tensors from [-1, 1] to [0, 255]
1246
+ source_np = ((sample["source_pixel_values"].permute(1, 2, 0).numpy() + 1.0) * 127.5).clip(0, 255).astype(np.uint8)
1247
+ target_np = ((sample["pixel_values"].permute(1, 2, 0).numpy() + 1.0) * 127.5).clip(0, 255).astype(np.uint8)
1248
+
1249
+ # Save images
1250
+ idx_str = f"{saved:05d}"
1251
+ Image.fromarray(source_np).save(os.path.join(output_dir, f"{idx_str}_{source_name}_source.jpg"))
1252
+ Image.fromarray(target_np).save(os.path.join(output_dir, f"{idx_str}_{source_name}_target.jpg"))
1253
+
1254
+ saved += 1
1255
+ if saved % 10 == 0:
1256
+ print(f"Saved {saved}/{num_samples} samples - {counts}")
1257
+ if saved >= num_samples:
1258
+ break
1259
+
1260
+ except Exception as e:
1261
+ print(f"Failed to process sample: {e}")
1262
+ continue
1263
+
1264
+ print(f"\nTest complete. Saved {saved} samples to {output_dir}")
1265
+ print(f"Distribution: {counts}")
1266
+
1267
+
1268
+ def _parse_test_args():
1269
+ import argparse
1270
+ parser = argparse.ArgumentParser(description="Test visualization for Kontext datasets")
1271
+ parser.add_argument("--interactive_jsonl", type=str, default="/robby/share/Editing/lzc/HOI_v1/final_metadata.jsonl",
1272
+ help="Path to interactive dataset JSONL")
1273
+ parser.add_argument("--placement_jsonl", type=str, default="/robby/share/Editing/lzc/subject_placement/metadata_relight.jsonl",
1274
+ help="Path to placement dataset JSONL")
1275
+ parser.add_argument("--pexels_jsonl", type=str, default=None,
1276
+ help="Path to pexels dataset JSONL")
1277
+ parser.add_argument("--interactive_base_dir", type=str, default="/robby/share/Editing/lzc/HOI_v1",
1278
+ help="Base directory for interactive dataset")
1279
+ parser.add_argument("--placement_base_dir", type=str, default=None,
1280
+ help="Base directory for placement dataset")
1281
+ parser.add_argument("--pexels_base_dir", type=str, default=None,
1282
+ help="Base directory for pexels dataset")
1283
+ parser.add_argument("--pexels_relight_base_dir", type=str, default="/robby/share/Editing/lzc/data/relight_outputs",
1284
+ help="Base directory for pexels relighted images")
1285
+ parser.add_argument("--seg_base_dir", type=str, default=None,
1286
+ help="Directory containing segmentation maps for pexels dataset")
1287
+ parser.add_argument("--interactive_weight", type=float, default=1.0,
1288
+ help="Sampling weight for interactive dataset (default: 1.0)")
1289
+ parser.add_argument("--placement_weight", type=float, default=1.0,
1290
+ help="Sampling weight for placement dataset (default: 1.0)")
1291
+ parser.add_argument("--pexels_weight", type=float, default=0,
1292
+ help="Sampling weight for pexels dataset (default: 1.0)")
1293
+ parser.add_argument("--output_dir", type=str, default="visualize_output",
1294
+ help="Output directory to save pairs")
1295
+ parser.add_argument("--num_samples", type=int, default=100,
1296
+ help="Number of pairs to save")
1297
+
1298
+ # Legacy arguments
1299
+ parser.add_argument("--test_jsonl", type=str, default=None,
1300
+ help="Legacy: Path to JSONL (uses as interactive_jsonl)")
1301
+ parser.add_argument("--base_dir", type=str, default=None,
1302
+ help="Legacy: Base directory (uses as interactive_base_dir)")
1303
+ return parser.parse_args()
1304
+
1305
+
1306
+ if __name__ == "__main__":
1307
+ try:
1308
+ args = _parse_test_args()
1309
+
1310
+ # Handle legacy args
1311
+ interactive_jsonl = args.interactive_jsonl or args.test_jsonl
1312
+ interactive_base_dir = args.interactive_base_dir or args.base_dir
1313
+
1314
+ _run_test_mode(
1315
+ interactive_jsonl=interactive_jsonl,
1316
+ placement_jsonl=args.placement_jsonl,
1317
+ pexels_jsonl=args.pexels_jsonl,
1318
+ interactive_base_dir=interactive_base_dir,
1319
+ placement_base_dir=args.placement_base_dir,
1320
+ pexels_base_dir=args.pexels_base_dir,
1321
+ pexels_relight_base_dir=args.pexels_relight_base_dir,
1322
+ seg_base_dir=args.seg_base_dir,
1323
+ interactive_weight=args.interactive_weight,
1324
+ placement_weight=args.placement_weight,
1325
+ pexels_weight=args.pexels_weight,
1326
+ output_dir=args.output_dir,
1327
+ num_samples=args.num_samples
1328
+ )
1329
+ except SystemExit:
1330
+ # Allow import usage without triggering test mode
1331
+ pass
1332
+
train/src/jsonl_datasets_kontext_local.py ADDED
@@ -0,0 +1,312 @@
1
+ from PIL import Image
2
+ from datasets import Dataset
3
+ from torchvision import transforms
4
+ import random
5
+ import torch
6
+ import os
7
+ from .pipeline_flux_kontext_control import PREFERRED_KONTEXT_RESOLUTIONS
8
+ from .jsonl_datasets_kontext import make_train_dataset_inpaint_mask
9
+ import numpy as np
10
+ import json
11
+ from .generate_diff_mask import generate_final_difference_mask, align_images
12
+
13
+ Image.MAX_IMAGE_PIXELS = None
14
+ BLEND_PIXEL_VALUES = True
15
+
16
+ def multiple_16(num: float):
17
+ return int(round(num / 16) * 16)
18
+
19
+ def choose_kontext_resolution_from_wh(width: int, height: int):
20
+ aspect_ratio = width / max(1, height)
21
+ _, best_w, best_h = min(
22
+ (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
23
+ )
24
+ return best_w, best_h
25
+
26
+ def collate_fn(examples):
27
+ if examples[0].get("cond_pixel_values") is not None:
28
+ cond_pixel_values = torch.stack([example["cond_pixel_values"] for example in examples])
29
+ cond_pixel_values = cond_pixel_values.to(memory_format=torch.contiguous_format).float()
30
+ else:
31
+ cond_pixel_values = None
32
+ if examples[0].get("source_pixel_values") is not None:
33
+ source_pixel_values = torch.stack([example["source_pixel_values"] for example in examples])
34
+ source_pixel_values = source_pixel_values.to(memory_format=torch.contiguous_format).float()
35
+ else:
36
+ source_pixel_values = None
37
+
38
+ target_pixel_values = torch.stack([example["pixel_values"] for example in examples])
39
+ target_pixel_values = target_pixel_values.to(memory_format=torch.contiguous_format).float()
40
+ token_ids_clip = torch.stack([example["token_ids_clip"] for example in examples])
41
+ token_ids_t5 = torch.stack([example["token_ids_t5"] for example in examples])
42
+
43
+ mask_values = None
44
+ if examples[0].get("mask_values") is not None:
45
+ mask_values = torch.stack([example["mask_values"] for example in examples])
46
+ mask_values = mask_values.to(memory_format=torch.contiguous_format).float()
47
+
48
+ return {
49
+ "cond_pixel_values": cond_pixel_values,
50
+ "source_pixel_values": source_pixel_values,
51
+ "pixel_values": target_pixel_values,
52
+ "text_ids_1": token_ids_clip,
53
+ "text_ids_2": token_ids_t5,
54
+ "mask_values": mask_values,
55
+ }
56
+
57
+
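A self-contained sketch of `collate_fn` (defined above) on dummy samples that only mimic the keys produced by the datasets in this file; shapes are illustrative:

```python
import torch

dummy = [{
    "source_pixel_values": torch.zeros(3, 64, 64),
    "pixel_values": torch.zeros(3, 64, 64),
    "cond_pixel_values": torch.zeros(3, 64, 64),
    "token_ids_clip": torch.zeros(77, dtype=torch.long),
    "token_ids_t5": torch.zeros(128, dtype=torch.long),
    "mask_values": torch.zeros(1, 64, 64),
} for _ in range(2)]
batch = collate_fn(dummy)
# batch["pixel_values"]: [2, 3, 64, 64]; batch["text_ids_1"]: [2, 77]; batch["mask_values"]: [2, 1, 64, 64]
# In training this is normally wired up via
# torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn).
```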
58
+ # Dataset for the local_edits JSON mapping; masks are loaded from args.masks_dir (precomputed) and cleaned up on the fly
59
+ def make_train_dataset_local_edits(args, tokenizers, accelerator=None):
60
+ # Read JSON entries
61
+ with open(args.local_edits_json, "r", encoding="utf-8") as f:
62
+ entries = json.load(f)
63
+
64
+ samples = []
65
+ for item in entries:
66
+ rel_path = item.get("path", "")
67
+ local_edits = item.get("local_edits", []) or []
68
+ if not rel_path or not local_edits:
69
+ continue
70
+
71
+ base_name = os.path.basename(rel_path)
72
+ prefix = os.path.splitext(base_name)[0]
73
+ group_dir = os.path.basename(os.path.dirname(rel_path))
74
+ gid_int = None
75
+ try:
76
+ gid_int = int(group_dir)
77
+ except Exception:
78
+ try:
79
+ digits = "".join([ch for ch in group_dir if ch.isdigit()])
80
+ gid_int = int(digits) if digits else None
81
+ except Exception:
82
+ gid_int = None
83
+
84
+ group_str = group_dir # e.g., "0139" from the JSON path segment
85
+
86
+ # Resolve source/target directories strictly as base/<0139>
87
+ src_dir_candidates = [os.path.join(args.source_frames_dir, group_str)]
88
+ tgt_dir_candidates = [os.path.join(args.target_frames_dir, group_str)]
89
+ src_dir = next((d for d in src_dir_candidates if d and os.path.isdir(d)), None)
90
+ tgt_dir = next((d for d in tgt_dir_candidates if d and os.path.isdir(d)), None)
91
+ if src_dir is None or tgt_dir is None:
92
+ continue
93
+
94
+ src_path = os.path.join(src_dir, f"{prefix}.png")
95
+ for idx, prompt in enumerate(local_edits, start=1):
96
+ tgt_path = os.path.join(tgt_dir, f"{prefix}_{idx}.png")
97
+ mask_path = os.path.join(args.masks_dir, group_str, f"{prefix}_{idx}.png")
98
+ if not (os.path.exists(src_path) and os.path.exists(tgt_path) and os.path.exists(mask_path)):
99
+ continue
100
+ samples.append({
101
+ "source_image": src_path,
102
+ "target_image": tgt_path,
103
+ "mask_image": mask_path,
104
+ "prompt": prompt,
105
+ })
106
+
107
+ size = args.cond_size
108
+
109
+ to_tensor_and_norm = transforms.Compose([
110
+ transforms.ToTensor(),
111
+ transforms.Normalize([0.5], [0.5]),
112
+ ])
113
+
114
+ cond_train_transforms = transforms.Compose(
115
+ [
116
+ transforms.Resize((size, size), interpolation=transforms.InterpolationMode.BILINEAR),
117
+ transforms.ToTensor(),
118
+ transforms.Normalize([0.5], [0.5]),
119
+ ]
120
+ )
121
+
122
+ tokenizer_clip = tokenizers[0]
123
+ tokenizer_t5 = tokenizers[1]
124
+
125
+ def tokenize_prompt_single(caption: str):
126
+ text_inputs_clip = tokenizer_clip(
127
+ [caption],
128
+ padding="max_length",
129
+ max_length=77,
130
+ truncation=True,
131
+ return_tensors="pt",
132
+ )
133
+ text_input_ids_1 = text_inputs_clip.input_ids[0]
134
+
135
+ text_inputs_t5 = tokenizer_t5(
136
+ [caption],
137
+ padding="max_length",
138
+ max_length=128,
139
+ truncation=True,
140
+ return_tensors="pt",
141
+ )
142
+ text_input_ids_2 = text_inputs_t5.input_ids[0]
143
+ return text_input_ids_1, text_input_ids_2
144
+
145
+ class LocalEditsDataset(torch.utils.data.Dataset):
146
+ def __init__(self, samples_ls):
147
+ self.samples = samples_ls
148
+ def __len__(self):
149
+ return len(self.samples)
150
+ def __getitem__(self, idx):
151
+ sample = self.samples[idx]
152
+ s_p = sample["source_image"]
153
+ t_p = sample["target_image"]
154
+ m_p = sample["mask_image"]
155
+ cap = sample["prompt"]
156
+
157
+ rr = random.randint(10, 20)
158
+ ri = random.randint(3, 5)
159
+ import cv2
160
+ mask_loaded = cv2.imread(m_p, cv2.IMREAD_GRAYSCALE)
161
+ if mask_loaded is None:
162
+ raise ValueError("mask load failed")
163
+ mask = mask_loaded.copy()
164
+
165
+ # Pre-expand mask by a fixed number of pixels before any random expansion
166
+ # Uses a cross-shaped kernel when tapered_corners is True to emulate "tapered" growth
167
+ pre_expand_px = int(getattr(args, "pre_expand_mask_px", 50))
168
+ pre_expand_tapered = bool(getattr(args, "pre_expand_tapered_corners", True))
169
+ if pre_expand_px != 0:
170
+ c = 0 if pre_expand_tapered else 1
171
+ pre_kernel = np.array([[c, 1, c],
172
+ [1, 1, 1],
173
+ [c, 1, c]], dtype=np.uint8)
174
+ if pre_expand_px > 0:
175
+ mask = cv2.dilate(mask, pre_kernel, iterations=pre_expand_px)
176
+ else:
177
+ mask = cv2.erode(mask, pre_kernel, iterations=abs(pre_expand_px))
178
+ if rr > 0 and ri > 0:
179
+ ksize = max(1, 2 * int(rr) + 1)
180
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (ksize, ksize))
181
+ for _ in range(max(1, ri)):
182
+ mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
183
+ mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
184
+
185
+ src_aligned, tgt_aligned = align_images(s_p, t_p)
186
+
187
+ best_w, best_h = choose_kontext_resolution_from_wh(tgt_aligned.width, tgt_aligned.height)
188
+ final_img_rs = tgt_aligned.resize((best_w, best_h), resample=Image.BILINEAR)
189
+ raw_img_rs = src_aligned.resize((best_w, best_h), resample=Image.BILINEAR)
190
+
191
+ target_tensor = to_tensor_and_norm(final_img_rs)
192
+ source_tensor = to_tensor_and_norm(raw_img_rs)
193
+
194
+ mask_img = Image.fromarray(mask.astype(np.uint8)).convert("L")
195
+ if mask_img.size != src_aligned.size:
196
+ mask_img = mask_img.resize(src_aligned.size, Image.NEAREST)
197
+ mask_np = np.array(mask_img)
198
+
199
+ mask_bin = (mask_np > 127).astype(np.uint8)
200
+ inv_mask = (1 - mask_bin).astype(np.uint8)
201
+ src_np = np.array(src_aligned)
202
+ masked_raw_np = src_np * inv_mask[..., None]
203
+ masked_raw_img = Image.fromarray(masked_raw_np.astype(np.uint8))
204
+ cond_tensor = cond_train_transforms(masked_raw_img)
205
+
206
+ # Prepare mask_values tensor at training resolution (best_w, best_h)
207
+ mask_img_rs = mask_img.resize((best_w, best_h), Image.NEAREST)
208
+ mask_np_rs = np.array(mask_img_rs)
209
+ mask_bin_rs = (mask_np_rs > 127).astype(np.float32)
210
+ mask_tensor = torch.from_numpy(mask_bin_rs).unsqueeze(0) # [1, H, W]
211
+
212
+ ids1, ids2 = tokenize_prompt_single(cap if isinstance(cap, str) else "")
213
+
214
+ # Optionally blend target and source using a blurred mask, controlled by args
215
+ if getattr(args, "blend_pixel_values", BLEND_PIXEL_VALUES):
216
+ blend_kernel = int(getattr(args, "blend_kernel", 21))
217
+ if blend_kernel % 2 == 0:
218
+ blend_kernel += 1
219
+ blend_sigma = float(getattr(args, "blend_sigma", 10.0))
220
+ gb = transforms.GaussianBlur(kernel_size=(blend_kernel, blend_kernel), sigma=(blend_sigma, blend_sigma))
221
+ # mask_tensor: [1, H, W] in [0,1]
222
+ blurred_mask = gb(mask_tensor) # [1, H, W]
223
+ # Expand to 3 channels to match image tensors
224
+ blurred_mask_3c = blurred_mask.expand(target_tensor.shape[0], -1, -1) # [3, H, W]
225
+ # Blend in normalized space (both tensors already normalized to [-1, 1])
226
+ target_tensor = (source_tensor * (1.0 - blurred_mask_3c)) + (target_tensor * blurred_mask_3c)
227
+ target_tensor = target_tensor.clamp(-1.0, 1.0)
228
+
229
+ return {
230
+ "source_pixel_values": source_tensor,
231
+ "pixel_values": target_tensor,
232
+ "cond_pixel_values": cond_tensor,
233
+ "token_ids_clip": ids1,
234
+ "token_ids_t5": ids2,
235
+ "mask_values": mask_tensor,
236
+ }
237
+
238
+ return LocalEditsDataset(samples)
239
+
240
+
241
+ class BalancedMixDataset(torch.utils.data.Dataset):
242
+ """
243
+ A wrapper dataset that mixes two datasets with a configurable ratio.
244
+
245
+ ratio_b_per_a defines how many samples from dataset_b for each sample from dataset_a:
246
+ - 0 => only dataset_a (local edits)
247
+ - 1 => 1:1 mix (default)
248
+ - 2 => 1:2 mix (A:B)
249
+ - any float supported (e.g., 0.5 => 2:1 mix)
250
+ """
251
+ def __init__(self, dataset_a, dataset_b, ratio_b_per_a: float = 1.0):
252
+ self.dataset_a = dataset_a
253
+ self.dataset_b = dataset_b
254
+ self.ratio_b_per_a = max(0.0, float(ratio_b_per_a))
255
+
256
+ len_a = len(dataset_a)
257
+ len_b = len(dataset_b)
258
+
259
+ # If ratio is 0, use all of dataset_a only
260
+ if self.ratio_b_per_a == 0 or len_b == 0:
261
+ a_indices = list(range(len_a))
262
+ random.shuffle(a_indices)
263
+ self.mapping = [(0, i) for i in a_indices]
264
+ return
265
+
266
+ # Determine how many we can draw without replacement
267
+ # n_a limited by A size and B availability according to ratio
268
+ n_a_by_ratio = int(len_b / self.ratio_b_per_a)
269
+ n_a = min(len_a, max(1, n_a_by_ratio))
270
+ n_b = min(len_b, max(1, int(round(n_a * self.ratio_b_per_a))))
271
+
272
+ a_indices = list(range(len_a))
273
+ b_indices = list(range(len_b))
274
+ random.shuffle(a_indices)
275
+ random.shuffle(b_indices)
276
+ a_indices = a_indices[: n_a]
277
+ b_indices = b_indices[: n_b]
278
+
279
+ mixed = [(0, i) for i in a_indices] + [(1, i) for i in b_indices]
280
+ random.shuffle(mixed)
281
+ self.mapping = mixed
282
+
283
+ def __len__(self):
284
+ return len(self.mapping)
285
+
286
+ def __getitem__(self, idx):
287
+ which, real_idx = self.mapping[idx]
288
+ if which == 0:
289
+ return self.dataset_a[real_idx]
290
+ else:
291
+ return self.dataset_b[real_idx]
292
+
293
+
294
+ def make_train_dataset_mixed(args, tokenizers, accelerator=None):
295
+ """
296
+ Create a mixed dataset from:
297
+ - Local edits dataset (this file)
298
+ - Inpaint-mask JSONL dataset (jsonl_datasets_kontext.make_train_dataset_inpaint_mask)
299
+
300
+ Ratio control via args.mix_ratio (float):
301
+ - 0 => only local edits dataset
302
+ - 1 => 1:1 mix (local:inpaint)
303
+ - 2 => 1:2 mix, etc. (a toy sketch of the capping follows this function)
304
+
305
+ Requirements:
306
+ - args.local_edits_json and related dirs must be set for local edits
307
+ - args.train_data_dir must be set for the JSONL inpaint dataset
308
+ """
309
+ ds_local = make_train_dataset_local_edits(args, tokenizers, accelerator)
310
+ ds_inpaint = make_train_dataset_inpaint_mask(args, tokenizers, accelerator)
311
+ mix_ratio = getattr(args, "mix_ratio", 1.0)
312
+ return BalancedMixDataset(ds_local, ds_inpaint, ratio_b_per_a=mix_ratio)
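A toy illustration (values invented) of how `ratio_b_per_a` caps the two pools, using the same arithmetic as `BalancedMixDataset.__init__`:

```python
len_a, len_b = 100, 1_000        # e.g. local-edit samples vs. inpaint-mask samples
for ratio in (0.5, 1.0, 2.0):
    n_a = min(len_a, max(1, int(len_b / ratio)))
    n_b = min(len_b, max(1, int(round(n_a * ratio))))
    print(ratio, n_a, n_b)        # 0.5 -> 100, 50; 1.0 -> 100, 100; 2.0 -> 100, 200
```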
train/src/layers.py ADDED
@@ -0,0 +1,279 @@
1
+ import inspect
2
+ import math
3
+ from typing import Callable, List, Optional, Tuple, Union
4
+ from einops import rearrange
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+ from torch import Tensor
9
+ from diffusers.models.attention_processor import Attention
10
+
11
+ class LoRALinearLayer(nn.Module):
12
+ def __init__(
13
+ self,
14
+ in_features: int,
15
+ out_features: int,
16
+ rank: int = 4,
17
+ network_alpha: Optional[float] = None,
18
+ device: Optional[Union[torch.device, str]] = None,
19
+ dtype: Optional[torch.dtype] = None,
20
+ cond_width=512,
21
+ cond_height=512,
22
+ number=0,
23
+ n_loras=1
24
+ ):
25
+ super().__init__()
26
+ self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
27
+ self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
28
+ # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
29
+ # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
30
+ self.network_alpha = network_alpha
31
+ self.rank = rank
32
+ self.out_features = out_features
33
+ self.in_features = in_features
34
+
35
+ nn.init.normal_(self.down.weight, std=1 / rank)
36
+ nn.init.zeros_(self.up.weight)
37
+
38
+ self.cond_height = cond_height
39
+ self.cond_width = cond_width
40
+ self.number = number
41
+ self.n_loras = n_loras
42
+
43
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
44
+ orig_dtype = hidden_states.dtype
45
+ dtype = self.down.weight.dtype
46
+
47
+ #### img condition
48
+ batch_size = hidden_states.shape[0]
49
+ cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
50
+ block_size = hidden_states.shape[1] - cond_size * self.n_loras
51
+ shape = (batch_size, hidden_states.shape[1], 3072)
52
+ mask = torch.ones(shape, device=hidden_states.device, dtype=dtype)
53
+ mask[:, :block_size+self.number*cond_size, :] = 0
54
+ mask[:, block_size+(self.number+1)*cond_size:, :] = 0
55
+ hidden_states = mask * hidden_states
56
+ ####
57
+
58
+ down_hidden_states = self.down(hidden_states.to(dtype))
59
+ up_hidden_states = self.up(down_hidden_states)
60
+
61
+ if self.network_alpha is not None:
62
+ up_hidden_states *= self.network_alpha / self.rank
63
+
64
+ return up_hidden_states.to(orig_dtype)
65
+
66
+
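A toy sketch of the token gating in `LoRALinearLayer.forward`: with 512x512 conditions each condition block spans 512//8 * 512//8 * 16 // 64 = 1024 latent tokens, and the LoRA delta is confined to the block selected by `number` (the hidden size must be 3072 to match the hard-coded mask width). Sizes below are arbitrary:

```python
import torch

cond_tokens = 512 // 8 * 512 // 8 * 16 // 64          # 1024 tokens per condition image
layer = LoRALinearLayer(3072, 3072, rank=4,
                        cond_width=512, cond_height=512, number=0, n_loras=1)
hidden = torch.randn(1, 1024 + cond_tokens, 3072)      # [batch, base tokens + cond tokens, dim]
delta = layer(hidden)                                   # same shape as `hidden`
# `up` is zero-initialized, so the delta is all zeros before training; once trained,
# the position mask keeps it non-zero only on this LoRA's 1024 condition tokens.
assert delta.shape == hidden.shape
```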
67
+ class MultiSingleStreamBlockLoraProcessor(nn.Module):
68
+ def __init__(self, dim: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, cond_width=512, cond_height=512, n_loras=1):
69
+ super().__init__()
70
+ # Initialize a list to store the LoRA layers
71
+ self.n_loras = n_loras
72
+ self.cond_width = cond_width
73
+ self.cond_height = cond_height
74
+
75
+ self.q_loras = nn.ModuleList([
76
+ LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
77
+ for i in range(n_loras)
78
+ ])
79
+ self.k_loras = nn.ModuleList([
80
+ LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
81
+ for i in range(n_loras)
82
+ ])
83
+ self.v_loras = nn.ModuleList([
84
+ LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
85
+ for i in range(n_loras)
86
+ ])
87
+ self.lora_weights = lora_weights
88
+
89
+
90
+ def __call__(self,
91
+ attn: Attention,
92
+ hidden_states: torch.FloatTensor,
93
+ encoder_hidden_states: torch.FloatTensor = None,
94
+ attention_mask: Optional[torch.FloatTensor] = None,
95
+ image_rotary_emb: Optional[torch.Tensor] = None,
96
+ use_cond = False,
97
+ ) -> torch.FloatTensor:
98
+
99
+ batch_size, seq_len, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
100
+ query = attn.to_q(hidden_states)
101
+ key = attn.to_k(hidden_states)
102
+ value = attn.to_v(hidden_states)
103
+
104
+ for i in range(self.n_loras):
105
+ query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
106
+ key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
107
+ value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
108
+
109
+ inner_dim = key.shape[-1]
110
+ head_dim = inner_dim // attn.heads
111
+
112
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
113
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
114
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
115
+
116
+ if attn.norm_q is not None:
117
+ query = attn.norm_q(query)
118
+ if attn.norm_k is not None:
119
+ key = attn.norm_k(key)
120
+
121
+ if image_rotary_emb is not None:
122
+ from diffusers.models.embeddings import apply_rotary_emb
123
+ query = apply_rotary_emb(query, image_rotary_emb)
124
+ key = apply_rotary_emb(key, image_rotary_emb)
125
+
126
+ cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
127
+ block_size = hidden_states.shape[1] - cond_size * self.n_loras
128
+ scaled_cond_size = cond_size
129
+ scaled_block_size = block_size
130
+ scaled_seq_len = query.shape[2]
131
+
132
+ num_cond_blocks = self.n_loras
133
+ # mask = torch.ones((scaled_seq_len, scaled_seq_len), device=hidden_states.device)
134
+ # mask[ :scaled_block_size, :] = 0 # First block_size row
135
+ # for i in range(num_cond_blocks):
136
+ # start = i * scaled_cond_size + scaled_block_size
137
+ # end = (i + 1) * scaled_cond_size + scaled_block_size
138
+ # mask[start:end, start:end] = 0 # Diagonal blocks
139
+ # mask = mask * -1e20
140
+ # mask = mask.to(query.dtype)
141
+
142
+ hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=None)
143
+
144
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
145
+ hidden_states = hidden_states.to(query.dtype)
146
+
147
+ cond_hidden_states = hidden_states[:, block_size:,:]
148
+ hidden_states = hidden_states[:, : block_size,:]
149
+
150
+ return hidden_states if not use_cond else (hidden_states, cond_hidden_states)
151
+
152
+
153
+ class MultiDoubleStreamBlockLoraProcessor(nn.Module):
154
+ def __init__(self, dim: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, cond_width=512, cond_height=512, n_loras=1):
155
+ super().__init__()
156
+
157
+ # Initialize a list to store the LoRA layers
158
+ self.n_loras = n_loras
159
+ self.cond_width = cond_width
160
+ self.cond_height = cond_height
161
+ self.q_loras = nn.ModuleList([
162
+ LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
163
+ for i in range(n_loras)
164
+ ])
165
+ self.k_loras = nn.ModuleList([
166
+ LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
167
+ for i in range(n_loras)
168
+ ])
169
+ self.v_loras = nn.ModuleList([
170
+ LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
171
+ for i in range(n_loras)
172
+ ])
173
+ self.proj_loras = nn.ModuleList([
174
+ LoRALinearLayer(dim, dim, ranks[i],network_alphas[i], device=device, dtype=dtype, cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
175
+ for i in range(n_loras)
176
+ ])
177
+ self.lora_weights = lora_weights
178
+
179
+
180
+ def __call__(self,
181
+ attn: Attention,
182
+ hidden_states: torch.FloatTensor,
183
+ encoder_hidden_states: torch.FloatTensor = None,
184
+ attention_mask: Optional[torch.FloatTensor] = None,
185
+ image_rotary_emb: Optional[torch.Tensor] = None,
186
+ use_cond=False,
187
+ ) -> torch.FloatTensor:
188
+
189
+ batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
190
+
191
+ # `context` projections.
192
+ inner_dim = 3072
193
+ head_dim = inner_dim // attn.heads
194
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
195
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
196
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
197
+
198
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
199
+ batch_size, -1, attn.heads, head_dim
200
+ ).transpose(1, 2)
201
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
202
+ batch_size, -1, attn.heads, head_dim
203
+ ).transpose(1, 2)
204
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
205
+ batch_size, -1, attn.heads, head_dim
206
+ ).transpose(1, 2)
207
+
208
+ if attn.norm_added_q is not None:
209
+ encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
210
+ if attn.norm_added_k is not None:
211
+ encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
212
+
213
+ query = attn.to_q(hidden_states)
214
+ key = attn.to_k(hidden_states)
215
+ value = attn.to_v(hidden_states)
216
+ for i in range(self.n_loras):
217
+ query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
218
+ key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
219
+ value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
220
+
221
+ inner_dim = key.shape[-1]
222
+ head_dim = inner_dim // attn.heads
223
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
224
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
225
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
226
+
227
+ if attn.norm_q is not None:
228
+ query = attn.norm_q(query)
229
+ if attn.norm_k is not None:
230
+ key = attn.norm_k(key)
231
+
232
+ # attention
233
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
234
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
235
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
236
+
237
+ if image_rotary_emb is not None:
238
+ from diffusers.models.embeddings import apply_rotary_emb
239
+ query = apply_rotary_emb(query, image_rotary_emb)
240
+ key = apply_rotary_emb(key, image_rotary_emb)
241
+
242
+ cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
243
+ block_size = hidden_states.shape[1] - cond_size * self.n_loras
244
+ scaled_cond_size = cond_size
245
+ scaled_seq_len = query.shape[2]
246
+ scaled_block_size = scaled_seq_len - cond_size * self.n_loras
247
+
248
+ num_cond_blocks = self.n_loras
249
+ # mask = torch.ones((scaled_seq_len, scaled_seq_len), device=hidden_states.device)
250
+ # mask[ :scaled_block_size, :] = 0 # First block_size row
251
+ # for i in range(num_cond_blocks):
252
+ # start = i * scaled_cond_size + scaled_block_size
253
+ # end = (i + 1) * scaled_cond_size + scaled_block_size
254
+ # mask[start:end, start:end] = 0 # Diagonal blocks
255
+ # mask = mask * -1e20
256
+ # mask = mask.to(query.dtype)
257
+
258
+ hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=None)
259
+
260
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
261
+ hidden_states = hidden_states.to(query.dtype)
262
+
263
+ encoder_hidden_states, hidden_states = (
264
+ hidden_states[:, : encoder_hidden_states.shape[1]],
265
+ hidden_states[:, encoder_hidden_states.shape[1] :],
266
+ )
267
+
268
+ # Linear projection (with LoRA weight applied to each proj layer)
269
+ hidden_states = attn.to_out[0](hidden_states)
270
+ for i in range(self.n_loras):
271
+ hidden_states = hidden_states + self.lora_weights[i] * self.proj_loras[i](hidden_states)
272
+ # dropout
273
+ hidden_states = attn.to_out[1](hidden_states)
274
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
275
+
276
+ cond_hidden_states = hidden_states[:, block_size:,:]
277
+ hidden_states = hidden_states[:, :block_size,:]
278
+
279
+ return (hidden_states, encoder_hidden_states, cond_hidden_states) if use_cond else (encoder_hidden_states, hidden_states)
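The processor above runs one joint attention over the concatenated text and image streams and then splits the condition tokens (`cond_hidden_states`) back off the tail of the sequence. The LoRA injection itself is purely additive: each projection is the frozen weight applied to the input plus a weighted sum of low-rank updates, one per control branch. Below is a minimal, self-contained sketch of that additive pattern; the `LoRALinearLayer` here is a simplified stand-in for illustration, not the class this file imports, and the dimensions are placeholders.

    import torch
    import torch.nn as nn

    class LoRALinearLayer(nn.Module):
        """Hypothetical minimal LoRA layer: rank-r down/up projection with an alpha/rank scale."""
        def __init__(self, in_features, out_features, rank=4, alpha=None):
            super().__init__()
            self.down = nn.Linear(in_features, rank, bias=False)
            self.up = nn.Linear(rank, out_features, bias=False)
            self.scale = (alpha / rank) if alpha is not None else 1.0
            nn.init.normal_(self.down.weight, std=1.0 / rank)
            nn.init.zeros_(self.up.weight)  # the low-rank update starts at zero

        def forward(self, x):
            return self.up(self.down(x)) * self.scale

    dim, n_loras = 3072, 2                       # dim matches the Flux hidden size used above
    to_q = nn.Linear(dim, dim)                   # stand-in for the frozen attn.to_q projection
    q_loras = nn.ModuleList([LoRALinearLayer(dim, dim, rank=128) for _ in range(n_loras)])
    lora_weights = [1.0, 1.0]

    x = torch.randn(1, 16, dim)                  # (batch, tokens, dim)
    query = to_q(x)
    for i in range(n_loras):                     # additive injection, as in the processor above
        query = query + lora_weights[i] * q_loras[i](x)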
train/src/lora_helper.py ADDED
@@ -0,0 +1,196 @@
1
+ from diffusers.models.attention_processor import FluxAttnProcessor2_0
2
+ from safetensors import safe_open
3
+ import re
4
+ import torch
5
+ from .layers import MultiDoubleStreamBlockLoraProcessor, MultiSingleStreamBlockLoraProcessor
6
+
7
+ device = "cuda"
8
+
9
+ def load_safetensors(path):
10
+ tensors = {}
11
+ with safe_open(path, framework="pt", device="cpu") as f:
12
+ for key in f.keys():
13
+ tensors[key] = f.get_tensor(key)
14
+ return tensors
15
+
16
+ def get_lora_rank(checkpoint):
17
+ for k in checkpoint.keys():
18
+ if k.endswith(".down.weight"):
19
+ return checkpoint[k].shape[0]
20
+
21
+ def load_checkpoint(local_path):
22
+ if local_path is not None:
23
+ if '.safetensors' in local_path:
24
+ print(f"Loading .safetensors checkpoint from {local_path}")
25
+ checkpoint = load_safetensors(local_path)
26
+ else:
27
+ print(f"Loading checkpoint from {local_path}")
28
+ checkpoint = torch.load(local_path, map_location='cpu')
29
+ return checkpoint
30
+
31
+ def update_model_with_lora(checkpoint, lora_weights, transformer, cond_size):
32
+ number = len(lora_weights)
33
+ ranks = [get_lora_rank(checkpoint) for _ in range(number)]
34
+ lora_attn_procs = {}
35
+ double_blocks_idx = list(range(19))
36
+ single_blocks_idx = list(range(38))
37
+ for name, attn_processor in transformer.attn_processors.items():
38
+ match = re.search(r'\.(\d+)\.', name)
39
+ if match:
40
+ layer_index = int(match.group(1))
41
+
42
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
43
+
44
+ lora_state_dicts = {}
45
+ for key, value in checkpoint.items():
46
+ # Match based on the layer index in the key (assuming the key contains layer index)
47
+ if re.search(r'\.(\d+)\.', key):
48
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
49
+ if checkpoint_layer_index == layer_index and key.startswith("transformer_blocks"):
50
+ lora_state_dicts[key] = value
51
+
52
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
53
+ dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=lora_weights, device=device, dtype=torch.bfloat16, cond_width=cond_size, cond_height=cond_size, n_loras=number
54
+ )
55
+
56
+ # Load the weights from the checkpoint dictionary into the corresponding layers
57
+ for n in range(number):
58
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
59
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
60
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
61
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
62
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
63
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
64
+ lora_attn_procs[name].proj_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.down.weight', None)
65
+ lora_attn_procs[name].proj_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.up.weight', None)
66
+ lora_attn_procs[name].to(device)
67
+
68
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
69
+
70
+ lora_state_dicts = {}
71
+ for key, value in checkpoint.items():
72
+ # Match based on the layer index in the key (assuming the key contains layer index)
73
+ if re.search(r'\.(\d+)\.', key):
74
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
75
+ if checkpoint_layer_index == layer_index and key.startswith("single_transformer_blocks"):
76
+ lora_state_dicts[key] = value
77
+
78
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
79
+ dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=lora_weights, device=device, dtype=torch.bfloat16, cond_width=cond_size, cond_height=cond_size, n_loras=number
80
+ )
81
+ # Load the weights from the checkpoint dictionary into the corresponding layers
82
+ for n in range(number):
83
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
84
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
85
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
86
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
87
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
88
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
89
+ lora_attn_procs[name].to(device)
90
+ else:
91
+ lora_attn_procs[name] = FluxAttnProcessor2_0()
92
+
93
+ transformer.set_attn_processor(lora_attn_procs)
94
+
95
+
96
+ def update_model_with_multi_lora(checkpoints, lora_weights, transformer, cond_size):
97
+ ck_number = len(checkpoints)
98
+ cond_lora_number = [len(ls) for ls in lora_weights]
99
+ cond_number = sum(cond_lora_number)
100
+ ranks = [get_lora_rank(checkpoint) for checkpoint in checkpoints]
101
+ multi_lora_weight = []
102
+ for ls in lora_weights:
103
+ for n in ls:
104
+ multi_lora_weight.append(n)
105
+
106
+ lora_attn_procs = {}
107
+ double_blocks_idx = list(range(19))
108
+ single_blocks_idx = list(range(38))
109
+ for name, attn_processor in transformer.attn_processors.items():
110
+ match = re.search(r'\.(\d+)\.', name)
111
+ if match:
112
+ layer_index = int(match.group(1))
113
+
114
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
115
+ lora_state_dicts = [{} for _ in range(ck_number)]
116
+ for idx, checkpoint in enumerate(checkpoints):
117
+ for key, value in checkpoint.items():
118
+ # Match based on the layer index in the key (assuming the key contains layer index)
119
+ if re.search(r'\.(\d+)\.', key):
120
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
121
+ if checkpoint_layer_index == layer_index and key.startswith("transformer_blocks"):
122
+ lora_state_dicts[idx][key] = value
123
+
124
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
125
+ dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=multi_lora_weight, device=device, dtype=torch.bfloat16, cond_width=cond_size, cond_height=cond_size, n_loras=cond_number
126
+ )
127
+
128
+ # Load the weights from the checkpoint dictionary into the corresponding layers
129
+ num = 0
130
+ for idx in range(ck_number):
131
+ for n in range(cond_lora_number[idx]):
132
+ lora_attn_procs[name].q_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.down.weight', None)
133
+ lora_attn_procs[name].q_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.up.weight', None)
134
+ lora_attn_procs[name].k_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.down.weight', None)
135
+ lora_attn_procs[name].k_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.up.weight', None)
136
+ lora_attn_procs[name].v_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.down.weight', None)
137
+ lora_attn_procs[name].v_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.up.weight', None)
138
+ lora_attn_procs[name].proj_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.proj_loras.{n}.down.weight', None)
139
+ lora_attn_procs[name].proj_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.proj_loras.{n}.up.weight', None)
140
+ lora_attn_procs[name].to(device)
141
+ num += 1
142
+
143
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
144
+
145
+ lora_state_dicts = [{} for _ in range(ck_number)]
146
+ for idx, checkpoint in enumerate(checkpoints):
147
+ for key, value in checkpoint.items():
148
+ # Match based on the layer index in the key (assuming the key contains layer index)
149
+ if re.search(r'\.(\d+)\.', key):
150
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
151
+ if checkpoint_layer_index == layer_index and key.startswith("single_transformer_blocks"):
152
+ lora_state_dicts[idx][key] = value
153
+
154
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
155
+ dim=3072, ranks=ranks, network_alphas=ranks, lora_weights=multi_lora_weight, device=device, dtype=torch.bfloat16, cond_width=cond_size, cond_height=cond_size, n_loras=cond_number
156
+ )
157
+ # Load the weights from the checkpoint dictionary into the corresponding layers
158
+ num = 0
159
+ for idx in range(ck_number):
160
+ for n in range(cond_lora_number[idx]):
161
+ lora_attn_procs[name].q_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.down.weight', None)
162
+ lora_attn_procs[name].q_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.up.weight', None)
163
+ lora_attn_procs[name].k_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.down.weight', None)
164
+ lora_attn_procs[name].k_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.up.weight', None)
165
+ lora_attn_procs[name].v_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.down.weight', None)
166
+ lora_attn_procs[name].v_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.up.weight', None)
167
+ lora_attn_procs[name].to(device)
168
+ num += 1
169
+
170
+ else:
171
+ lora_attn_procs[name] = FluxAttnProcessor2_0()
172
+
173
+ transformer.set_attn_processor(lora_attn_procs)
174
+
175
+
176
+ def set_single_lora(transformer, local_path, lora_weights=[], cond_size=512):
177
+ checkpoint = load_checkpoint(local_path)
178
+ update_model_with_lora(checkpoint, lora_weights, transformer, cond_size)
179
+
180
+ def set_multi_lora(transformer, local_paths, lora_weights=[[]], cond_size=512):
181
+ checkpoints = [load_checkpoint(local_path) for local_path in local_paths]
182
+ update_model_with_multi_lora(checkpoints, lora_weights, transformer, cond_size)
183
+
184
+ def unset_lora(transformer):
185
+ lora_attn_procs = {}
186
+ for name, attn_processor in transformer.attn_processors.items():
187
+ lora_attn_procs[name] = FluxAttnProcessor2_0()
188
+ transformer.set_attn_processor(lora_attn_procs)
189
+
190
+
191
+ '''
192
+ unset_lora(pipe.transformer)
193
+ lora_path = "./lora.safetensors"
194
+ lora_weights = [1, 1]
195
+ set_single_lora(pipe.transformer, local_path=lora_path, lora_weights=lora_weights, cond_size=512)
196
+ '''
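The helpers above rebuild the transformer's attention-processor dict: blocks whose index appears in `double_blocks_idx`/`single_blocks_idx` get a multi-LoRA processor with the checkpoint's `down`/`up` weights copied in, and every other block keeps the stock `FluxAttnProcessor2_0`. A hedged usage sketch follows; the checkpoint paths, weights, and the `pipe` object are placeholders, and the import assumes the `train/` directory is on `sys.path`.

    from src.lora_helper import set_single_lora, set_multi_lora, unset_lora

    # One control LoRA: a single checkpoint with one weight per condition branch it contains.
    set_single_lora(pipe.transformer, "./checkpoints/edge.safetensors",
                    lora_weights=[1.0], cond_size=512)

    # Several control LoRAs: one checkpoint per control type, weights nested per checkpoint.
    set_multi_lora(pipe.transformer,
                   ["./checkpoints/edge.safetensors", "./checkpoints/color.safetensors"],
                   lora_weights=[[1.0], [0.8]], cond_size=512)

    # Restore the stock FluxAttnProcessor2_0 on every block.
    unset_lora(pipe.transformer)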
train/src/masks_integrated.py ADDED
@@ -0,0 +1,322 @@
1
+ import math
2
+ import random
3
+ import logging
4
+ from enum import Enum
5
+
6
+ import cv2
7
+ import numpy as np
8
+ import random
9
+
10
+ LOGGER = logging.getLogger(__name__)
11
+
12
+ class LinearRamp:
13
+ def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0):
14
+ self.start_value = start_value
15
+ self.end_value = end_value
16
+ self.start_iter = start_iter
17
+ self.end_iter = end_iter
18
+
19
+ def __call__(self, i):
20
+ if i < self.start_iter:
21
+ return self.start_value
22
+ if i >= self.end_iter:
23
+ return self.end_value
24
+ part = (i - self.start_iter) / (self.end_iter - self.start_iter)
25
+ return self.start_value * (1 - part) + self.end_value * part
26
+
27
+ class DrawMethod(Enum):
28
+ LINE = 'line'
29
+ CIRCLE = 'circle'
30
+ SQUARE = 'square'
31
+
32
+ def make_random_irregular_mask(shape, max_angle=4, max_len=60, max_width=20, min_times=0, max_times=10,
33
+ draw_method=DrawMethod.LINE):
34
+ """Generate an irregular mask from angle/length-based line strokes."""
35
+ draw_method = DrawMethod(draw_method)
36
+
37
+ height, width = shape
38
+ mask = np.zeros((height, width), np.float32)
39
+ times = np.random.randint(min_times, max_times + 1)
40
+ for i in range(times):
41
+ start_x = np.random.randint(width)
42
+ start_y = np.random.randint(height)
43
+ for j in range(1 + np.random.randint(5)):
44
+ angle = 0.01 + np.random.randint(max_angle)
45
+ if i % 2 == 0:
46
+ angle = 2 * 3.1415926 - angle
47
+ length = 10 + np.random.randint(max_len)
48
+ brush_w = 5 + np.random.randint(max_width)
49
+ end_x = np.clip((start_x + length * np.sin(angle)).astype(np.int32), 0, width)
50
+ end_y = np.clip((start_y + length * np.cos(angle)).astype(np.int32), 0, height)
51
+ if draw_method == DrawMethod.LINE:
52
+ cv2.line(mask, (start_x, start_y), (end_x, end_y), 1.0, brush_w)
53
+ elif draw_method == DrawMethod.CIRCLE:
54
+ cv2.circle(mask, (start_x, start_y), radius=brush_w, color=1., thickness=-1)
55
+ elif draw_method == DrawMethod.SQUARE:
56
+ radius = brush_w // 2
57
+ mask[start_y - radius:start_y + radius, start_x - radius:start_x + radius] = 1
58
+ start_x, start_y = end_x, end_y
59
+ return mask[None, ...]
60
+
61
+
62
+ def make_random_rectangle_mask(shape, margin=10, bbox_min_size=30, bbox_max_size=100, min_times=0, max_times=3):
63
+ """Generate a mask made of random rectangles."""
64
+ height, width = shape
65
+ mask = np.zeros((height, width), np.float32)
66
+ bbox_max_size = min(bbox_max_size, height - margin * 2, width - margin * 2)
67
+ times = np.random.randint(min_times, max_times + 1)
68
+ for i in range(times):
69
+ box_width = np.random.randint(bbox_min_size, bbox_max_size)
70
+ box_height = np.random.randint(bbox_min_size, bbox_max_size)
71
+ start_x = np.random.randint(margin, width - margin - box_width + 1)
72
+ start_y = np.random.randint(margin, height - margin - box_height + 1)
73
+ mask[start_y:start_y + box_height, start_x:start_x + box_width] = 1
74
+ return mask[None, ...]
75
+
76
+
77
+ def make_random_superres_mask(shape, min_step=2, max_step=4, min_width=1, max_width=3):
78
+ """Generate a regular grid mask in the style of super-resolution masks."""
79
+ height, width = shape
80
+ mask = np.zeros((height, width), np.float32)
81
+ step_x = np.random.randint(min_step, max_step + 1)
82
+ width_x = np.random.randint(min_width, min(step_x, max_width + 1))
83
+ offset_x = np.random.randint(0, step_x)
84
+
85
+ step_y = np.random.randint(min_step, max_step + 1)
86
+ width_y = np.random.randint(min_width, min(step_y, max_width + 1))
87
+ offset_y = np.random.randint(0, step_y)
88
+
89
+ for dy in range(width_y):
90
+ mask[offset_y + dy::step_y] = 1
91
+ for dx in range(width_x):
92
+ mask[:, offset_x + dx::step_x] = 1
93
+ return mask[None, ...]
94
+
95
+
96
+ def make_brush_stroke_mask(shape, num_strokes_range=(1, 5), stroke_width_range=(5, 30),
97
+ max_offset=50, num_points_range=(5, 15)):
98
+ """Generate a brush-stroke style mask: continuous strokes built from random offsets."""
99
+ num_strokes = random.randint(*num_strokes_range)
100
+ height, width = shape
101
+ mask = np.zeros((height, width), dtype=np.float32)
102
+
103
+ for _ in range(num_strokes):
104
+ # Random starting point
105
+ start_x = random.randint(0, width)
106
+ start_y = random.randint(0, height)
107
+
108
+ # Random stroke parameters
109
+ num_points = random.randint(*num_points_range)
110
+ stroke_width = random.randint(*stroke_width_range)
111
+
112
+ points = [(start_x, start_y)]
113
+
114
+ # Generate a chain of connected points
115
+ for i in range(num_points):
116
+ prev_x, prev_y = points[-1]
117
+ # Add a random offset
118
+ dx = random.randint(-max_offset, max_offset)
119
+ dy = random.randint(-max_offset, max_offset)
120
+ new_x = max(0, min(width, prev_x + dx))
121
+ new_y = max(0, min(height, prev_y + dy))
122
+ points.append((new_x, new_y))
123
+
124
+ # Draw the stroke
125
+ for i in range(len(points) - 1):
126
+ cv2.line(mask, points[i], points[i+1], 1.0, stroke_width)
127
+
128
+ return mask[None, ...]
129
+
130
+
131
+ class RandomIrregularMaskGenerator:
132
+ """Irregular mask generator."""
133
+ def __init__(self, max_angle=4, max_len=60, max_width=20, min_times=0, max_times=10, ramp_kwargs=None,
134
+ draw_method=DrawMethod.LINE):
135
+ self.max_angle = max_angle
136
+ self.max_len = max_len
137
+ self.max_width = max_width
138
+ self.min_times = min_times
139
+ self.max_times = max_times
140
+ self.draw_method = draw_method
141
+ self.ramp = LinearRamp(**ramp_kwargs) if ramp_kwargs is not None else None
142
+
143
+ def __call__(self, img, iter_i=None, raw_image=None):
144
+ coef = self.ramp(iter_i) if (self.ramp is not None) and (iter_i is not None) else 1
145
+ cur_max_len = int(max(1, self.max_len * coef))
146
+ cur_max_width = int(max(1, self.max_width * coef))
147
+ cur_max_times = int(self.min_times + 1 + (self.max_times - self.min_times) * coef)
148
+ return make_random_irregular_mask(img.shape[1:], max_angle=self.max_angle, max_len=cur_max_len,
149
+ max_width=cur_max_width, min_times=self.min_times, max_times=cur_max_times,
150
+ draw_method=self.draw_method)
151
+
152
+
153
+ class RandomRectangleMaskGenerator:
154
+ """Rectangle mask generator."""
155
+ def __init__(self, margin=10, bbox_min_size=30, bbox_max_size=100, min_times=0, max_times=3, ramp_kwargs=None):
156
+ self.margin = margin
157
+ self.bbox_min_size = bbox_min_size
158
+ self.bbox_max_size = bbox_max_size
159
+ self.min_times = min_times
160
+ self.max_times = max_times
161
+ self.ramp = LinearRamp(**ramp_kwargs) if ramp_kwargs is not None else None
162
+
163
+ def __call__(self, img, iter_i=None, raw_image=None):
164
+ coef = self.ramp(iter_i) if (self.ramp is not None) and (iter_i is not None) else 1
165
+ cur_bbox_max_size = int(self.bbox_min_size + 1 + (self.bbox_max_size - self.bbox_min_size) * coef)
166
+ cur_max_times = int(self.min_times + (self.max_times - self.min_times) * coef)
167
+ return make_random_rectangle_mask(img.shape[1:], margin=self.margin, bbox_min_size=self.bbox_min_size,
168
+ bbox_max_size=cur_bbox_max_size, min_times=self.min_times,
169
+ max_times=cur_max_times)
170
+
171
+
172
+ class RandomSuperresMaskGenerator:
173
+ """Super-resolution mask generator."""
174
+ def __init__(self, **kwargs):
175
+ self.kwargs = kwargs
176
+
177
+ def __call__(self, img, iter_i=None):
178
+ return make_random_superres_mask(img.shape[1:], **self.kwargs)
179
+
180
+
181
+ class BrushStrokeMaskGenerator:
182
+ """Brush-stroke mask generator."""
183
+ def __init__(self, num_strokes_range=(1, 5), stroke_width_range=(5, 30),
184
+ max_offset=50, num_points_range=(5, 15), ramp_kwargs=None):
185
+ self.num_strokes_range = num_strokes_range
186
+ self.stroke_width_range = stroke_width_range
187
+ self.max_offset = max_offset
188
+ self.num_points_range = num_points_range
189
+ self.ramp = LinearRamp(**ramp_kwargs) if ramp_kwargs is not None else None
190
+
191
+ def __call__(self, img, iter_i=None, raw_image=None):
192
+ coef = self.ramp(iter_i) if (self.ramp is not None) and (iter_i is not None) else 1
193
+ cur_num_strokes = int(max(1, self.num_strokes_range[1] * coef))
194
+ cur_stroke_width_range = (
195
+ int(max(1, self.stroke_width_range[0] * coef)),
196
+ int(max(1, self.stroke_width_range[1] * coef))
197
+ )
198
+ return make_brush_stroke_mask(
199
+ img.shape[1:],
200
+ num_strokes_range=(cur_num_strokes, cur_num_strokes),
201
+ stroke_width_range=cur_stroke_width_range,
202
+ max_offset=self.max_offset,
203
+ num_points_range=self.num_points_range
204
+ )
205
+
206
+
207
+ class DumbAreaMaskGenerator:
208
+ """Simple area mask generator."""
209
+ min_ratio = 0.1
210
+ max_ratio = 0.35
211
+ default_ratio = 0.225
212
+
213
+ def __init__(self, is_training):
214
+ # Parameters:
215
+ # is_training(bool): If true - random rectangular mask, if false - central square mask
216
+ self.is_training = is_training
217
+
218
+ def _random_vector(self, dimension):
219
+ if self.is_training:
220
+ lower_limit = math.sqrt(self.min_ratio)
221
+ upper_limit = math.sqrt(self.max_ratio)
222
+ mask_side = round((random.random() * (upper_limit - lower_limit) + lower_limit) * dimension)
223
+ u = random.randint(0, dimension-mask_side-1)
224
+ v = u+mask_side
225
+ else:
226
+ margin = (math.sqrt(self.default_ratio) / 2) * dimension
227
+ u = round(dimension/2 - margin)
228
+ v = round(dimension/2 + margin)
229
+ return u, v
230
+
231
+ def __call__(self, img, iter_i=None, raw_image=None):
232
+ c, height, width = img.shape
233
+ mask = np.zeros((height, width), np.float32)
234
+ x1, x2 = self._random_vector(width)
235
+ y1, y2 = self._random_vector(height)
236
+ mask[y1:y2, x1:x2] = 1  # rows are indexed by y (height), columns by x (width)
237
+ return mask[None, ...]
238
+
239
+
240
+ class IntegratedMaskGenerator:
241
+ """Integrated mask generator that mixes multiple mask types."""
242
+ def __init__(self, irregular_proba=1/4, irregular_kwargs=None,
243
+ box_proba=1/4, box_kwargs=None,
244
+ segm_proba=1/4, segm_kwargs=None,
245
+ brush_stroke_proba=1/4, brush_stroke_kwargs=None,
246
+ superres_proba=0, superres_kwargs=None,
247
+ squares_proba=0, squares_kwargs=None,
248
+ invert_proba=0):
249
+ self.probas = []
250
+ self.gens = []
251
+
252
+ if irregular_proba > 0:
253
+ self.probas.append(irregular_proba)
254
+ if irregular_kwargs is None:
255
+ irregular_kwargs = {}
256
+ else:
257
+ irregular_kwargs = dict(irregular_kwargs)
258
+ irregular_kwargs['draw_method'] = DrawMethod.LINE
259
+ self.gens.append(RandomIrregularMaskGenerator(**irregular_kwargs))
260
+
261
+ if box_proba > 0:
262
+ self.probas.append(box_proba)
263
+ if box_kwargs is None:
264
+ box_kwargs = {}
265
+ self.gens.append(RandomRectangleMaskGenerator(**box_kwargs))
266
+
267
+ if brush_stroke_proba > 0:
268
+ self.probas.append(brush_stroke_proba)
269
+ if brush_stroke_kwargs is None:
270
+ brush_stroke_kwargs = {}
271
+ self.gens.append(BrushStrokeMaskGenerator(**brush_stroke_kwargs))
272
+
273
+ if superres_proba > 0:
274
+ self.probas.append(superres_proba)
275
+ if superres_kwargs is None:
276
+ superres_kwargs = {}
277
+ self.gens.append(RandomSuperresMaskGenerator(**superres_kwargs))
278
+
279
+ if squares_proba > 0:
280
+ self.probas.append(squares_proba)
281
+ if squares_kwargs is None:
282
+ squares_kwargs = {}
283
+ else:
284
+ squares_kwargs = dict(squares_kwargs)
285
+ squares_kwargs['draw_method'] = DrawMethod.SQUARE
286
+ self.gens.append(RandomIrregularMaskGenerator(**squares_kwargs))
287
+
288
+ self.probas = np.array(self.probas, dtype='float32')
289
+ self.probas /= self.probas.sum()
290
+ self.invert_proba = invert_proba
291
+
292
+ def __call__(self, img, iter_i=None, raw_image=None):
293
+ kind = np.random.choice(len(self.probas), p=self.probas)
294
+ gen = self.gens[kind]
295
+ result = gen(img, iter_i=iter_i, raw_image=raw_image)
296
+ if self.invert_proba > 0 and random.random() < self.invert_proba:
297
+ result = 1 - result
298
+ return result
299
+
300
+
301
+ def get_mask_generator(kind, kwargs):
302
+ """Factory function that returns a mask generator."""
303
+ if kind is None:
304
+ kind = "integrated"
305
+ if kwargs is None:
306
+ kwargs = {}
307
+
308
+ if kind == "integrated":
309
+ cl = IntegratedMaskGenerator
310
+ elif kind == "irregular":
311
+ cl = RandomIrregularMaskGenerator
312
+ elif kind == "rectangle":
313
+ cl = RandomRectangleMaskGenerator
314
+ elif kind == "brush_stroke":
315
+ cl = BrushStrokeMaskGenerator
316
+ elif kind == "superres":
317
+ cl = RandomSuperresMaskGenerator
318
+ elif kind == "dumb":
319
+ cl = DumbAreaMaskGenerator
320
+ else:
321
+ raise NotImplementedError(f"No such generator kind = {kind}")
322
+ return cl(**kwargs)
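The factory at the end is the intended entry point: `get_mask_generator` maps a kind string to one of the generator classes, and `IntegratedMaskGenerator` then samples one mask type per call according to the normalized probabilities. A small usage sketch, with illustrative (not tuned) probabilities and a dummy input image; the import assumes the `train/` directory is on `sys.path`.

    import numpy as np
    from src.masks_integrated import get_mask_generator

    # Mix irregular lines, rectangles and brush strokes; occasionally invert the result.
    gen = get_mask_generator("integrated", {
        "irregular_proba": 0.3,
        "box_proba": 0.3,
        "brush_stroke_proba": 0.4,
        "invert_proba": 0.1,
    })

    img = np.zeros((3, 512, 512), dtype=np.float32)   # dummy CHW image
    mask = gen(img)                                   # shape (1, 512, 512), values in {0.0, 1.0}
    print(mask.shape, float(mask.mean()))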
train/src/pipeline_flux_kontext_control.py ADDED
@@ -0,0 +1,1009 @@
1
+ import inspect
2
+ from typing import Any, Callable, Dict, List, Optional, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ from .transformer_flux import FluxTransformer2DModel
7
+ from transformers import (
8
+ CLIPImageProcessor,
9
+ CLIPTextModel,
10
+ CLIPTokenizer,
11
+ CLIPVisionModelWithProjection,
12
+ T5EncoderModel,
13
+ T5TokenizerFast,
14
+ )
15
+
16
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
17
+ from diffusers.loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
18
+ from diffusers.models import AutoencoderKL
19
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
20
+ from diffusers.utils import (
21
+ USE_PEFT_BACKEND,
22
+ is_torch_xla_available,
23
+ logging,
24
+ replace_example_docstring,
25
+ scale_lora_layers,
26
+ unscale_lora_layers,
27
+ )
28
+ from diffusers.utils.torch_utils import randn_tensor
29
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
30
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
31
+ from torchvision.transforms.functional import pad
32
+
33
+
34
+ if is_torch_xla_available():
35
+ import torch_xla.core.xla_model as xm
36
+
37
+ XLA_AVAILABLE = True
38
+ else:
39
+ XLA_AVAILABLE = False
40
+
41
+
42
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
43
+
44
+ PREFERRED_KONTEXT_RESOLUTIONS = [
45
+ (672, 1568),
46
+ (688, 1504),
47
+ (720, 1456),
48
+ (752, 1392),
49
+ (800, 1328),
50
+ (832, 1248),
51
+ (880, 1184),
52
+ (944, 1104),
53
+ (1024, 1024),
54
+ (1104, 944),
55
+ (1184, 880),
56
+ (1248, 832),
57
+ (1328, 800),
58
+ (1392, 752),
59
+ (1456, 720),
60
+ (1504, 688),
61
+ (1568, 672),
62
+ ]
63
+
64
+
65
+ def calculate_shift(
66
+ image_seq_len,
67
+ base_seq_len: int = 256,
68
+ max_seq_len: int = 4096,
69
+ base_shift: float = 0.5,
70
+ max_shift: float = 1.15,
71
+ ):
72
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
73
+ b = base_shift - m * base_seq_len
74
+ mu = image_seq_len * m + b
75
+ return mu
76
+
77
+
78
+ def prepare_latent_image_ids_(height, width, device, dtype):
79
+ latent_image_ids = torch.zeros(height, width, 3, device=device, dtype=dtype)
80
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height, device=device)[:, None] # y
81
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width, device=device)[None, :] # x
82
+ return latent_image_ids
83
+
84
+
85
+ def prepare_latent_subject_ids(height, width, device, dtype):
86
+ latent_image_ids = torch.zeros(height, width, 3, device=device, dtype=dtype)
87
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height, device=device)[:, None]
88
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width, device=device)[None, :]
89
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
90
+ latent_image_ids = latent_image_ids.reshape(
91
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
92
+ )
93
+ return latent_image_ids.to(device=device, dtype=dtype)
94
+
95
+
96
+ def resize_position_encoding(
97
+ batch_size, original_height, original_width, target_height, target_width, device, dtype
98
+ ):
99
+ latent_image_ids = prepare_latent_image_ids_(original_height // 2, original_width // 2, device, dtype)
100
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
101
+ latent_image_ids = latent_image_ids.reshape(
102
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
103
+ )
104
+
105
+ scale_h = original_height / target_height
106
+ scale_w = original_width / target_width
107
+ latent_image_ids_resized = torch.zeros(target_height // 2, target_width // 2, 3, device=device, dtype=dtype)
108
+ latent_image_ids_resized[..., 1] = (
109
+ latent_image_ids_resized[..., 1] + torch.arange(target_height // 2, device=device)[:, None] * scale_h
110
+ )
111
+ latent_image_ids_resized[..., 2] = (
112
+ latent_image_ids_resized[..., 2] + torch.arange(target_width // 2, device=device)[None, :] * scale_w
113
+ )
114
+
115
+ cond_latent_image_id_height, cond_latent_image_id_width, cond_latent_image_id_channels = (
116
+ latent_image_ids_resized.shape
117
+ )
118
+ cond_latent_image_ids = latent_image_ids_resized.reshape(
119
+ cond_latent_image_id_height * cond_latent_image_id_width, cond_latent_image_id_channels
120
+ )
121
+ return latent_image_ids, cond_latent_image_ids
122
+
123
+
124
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
125
+ def retrieve_timesteps(
126
+ scheduler,
127
+ num_inference_steps: Optional[int] = None,
128
+ device: Optional[Union[str, torch.device]] = None,
129
+ timesteps: Optional[List[int]] = None,
130
+ sigmas: Optional[List[float]] = None,
131
+ **kwargs,
132
+ ):
133
+ r"""
134
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
135
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
136
+
137
+ Args:
138
+ scheduler (`SchedulerMixin`):
139
+ The scheduler to get timesteps from.
140
+ num_inference_steps (`int`):
141
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
142
+ must be `None`.
143
+ device (`str` or `torch.device`, *optional*):
144
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
145
+ timesteps (`List[int]`, *optional*):
146
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
147
+ `num_inference_steps` and `sigmas` must be `None`.
148
+ sigmas (`List[float]`, *optional*):
149
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
150
+ `num_inference_steps` and `timesteps` must be `None`.
151
+
152
+ Returns:
153
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
154
+ second element is the number of inference steps.
155
+ """
156
+ if timesteps is not None and sigmas is not None:
157
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
158
+ if timesteps is not None:
159
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
160
+ if not accepts_timesteps:
161
+ raise ValueError(
162
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
163
+ f" timestep schedules. Please check whether you are using the correct scheduler."
164
+ )
165
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
166
+ timesteps = scheduler.timesteps
167
+ num_inference_steps = len(timesteps)
168
+ elif sigmas is not None:
169
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
170
+ if not accept_sigmas:
171
+ raise ValueError(
172
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
173
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
174
+ )
175
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
176
+ timesteps = scheduler.timesteps
177
+ num_inference_steps = len(timesteps)
178
+ else:
179
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
180
+ timesteps = scheduler.timesteps
181
+ return timesteps, num_inference_steps
182
+
183
+
184
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
185
+ def retrieve_latents(
186
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
187
+ ):
188
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
189
+ return encoder_output.latent_dist.sample(generator)
190
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
191
+ return encoder_output.latent_dist.mode()
192
+ elif hasattr(encoder_output, "latents"):
193
+ return encoder_output.latents
194
+ else:
195
+ raise AttributeError("Could not access latents of provided encoder_output")
196
+
197
+
198
+ class FluxKontextControlPipeline(
199
+ DiffusionPipeline,
200
+ FluxLoraLoaderMixin,
201
+ FromSingleFileMixin,
202
+ TextualInversionLoaderMixin,
203
+ ):
204
+ r"""
205
+ The Flux Kontext pipeline for image-to-image and text-to-image generation with EasyControl.
206
+
207
+ Reference: https://bfl.ai/announcements/flux-1-kontext-dev
208
+
209
+ Args:
210
+ transformer ([`FluxTransformer2DModel`]):
211
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
212
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
213
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
214
+ vae ([`AutoencoderKL`]):
215
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
216
+ text_encoder ([`CLIPTextModel`]):
217
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
218
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
219
+ text_encoder_2 ([`T5EncoderModel`]):
220
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
221
+ the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
222
+ tokenizer (`CLIPTokenizer`):
223
+ Tokenizer of class
224
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
225
+ tokenizer_2 (`T5TokenizerFast`):
226
+ Second Tokenizer of class
227
+ [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
228
+ """
229
+
230
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
231
+ _optional_components = []
232
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
233
+
234
+ def __init__(
235
+ self,
236
+ scheduler: FlowMatchEulerDiscreteScheduler,
237
+ vae: AutoencoderKL,
238
+ text_encoder: CLIPTextModel,
239
+ tokenizer: CLIPTokenizer,
240
+ text_encoder_2: T5EncoderModel,
241
+ tokenizer_2: T5TokenizerFast,
242
+ transformer: FluxTransformer2DModel,
243
+ image_encoder: CLIPVisionModelWithProjection = None,
244
+ feature_extractor: CLIPImageProcessor = None,
245
+ ):
246
+ super().__init__()
247
+
248
+ self.register_modules(
249
+ vae=vae,
250
+ text_encoder=text_encoder,
251
+ text_encoder_2=text_encoder_2,
252
+ tokenizer=tokenizer,
253
+ tokenizer_2=tokenizer_2,
254
+ transformer=transformer,
255
+ scheduler=scheduler,
256
+ image_encoder=None,
257
+ feature_extractor=None,
258
+ )
259
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
260
+ # Flux latents are packed into 2x2 patches, so use VAE factor multiplied by patch size for image processing
261
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
262
+ self.tokenizer_max_length = (
263
+ self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
264
+ )
265
+ self.default_sample_size = 128
266
+ self.latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
267
+ # EasyControl: cache multiple control LoRA processors
268
+ self.control_lora_processors: Dict[str, Dict[str, Any]] = {}
269
+ self.control_lora_cond_sizes: Dict[str, Any] = {}
270
+ self.current_control_type: Optional[str] = None
271
+
272
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
273
+ def _get_t5_prompt_embeds(
274
+ self,
275
+ prompt: Union[str, List[str]] = None,
276
+ num_images_per_prompt: int = 1,
277
+ max_sequence_length: int = 512,
278
+ device: Optional[torch.device] = None,
279
+ dtype: Optional[torch.dtype] = None,
280
+ ):
281
+ device = device or self._execution_device
282
+ dtype = dtype or self.text_encoder.dtype
283
+
284
+ prompt = [prompt] if isinstance(prompt, str) else prompt
285
+ batch_size = len(prompt)
286
+
287
+ if isinstance(self, TextualInversionLoaderMixin):
288
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
289
+
290
+ text_inputs = self.tokenizer_2(
291
+ prompt,
292
+ padding="max_length",
293
+ max_length=max_sequence_length,
294
+ truncation=True,
295
+ return_length=False,
296
+ return_overflowing_tokens=False,
297
+ return_tensors="pt",
298
+ )
299
+ text_input_ids = text_inputs.input_ids
300
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
301
+
302
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
303
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
304
+ logger.warning(
305
+ "The following part of your input was truncated because `max_sequence_length` is set to "
306
+ f" {max_sequence_length} tokens: {removed_text}"
307
+ )
308
+
309
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
310
+
311
+ dtype = self.text_encoder_2.dtype
312
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
313
+
314
+ _, seq_len, _ = prompt_embeds.shape
315
+
316
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
317
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
318
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
319
+
320
+ return prompt_embeds
321
+
322
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
323
+ def _get_clip_prompt_embeds(
324
+ self,
325
+ prompt: Union[str, List[str]],
326
+ num_images_per_prompt: int = 1,
327
+ device: Optional[torch.device] = None,
328
+ ):
329
+ device = device or self._execution_device
330
+
331
+ prompt = [prompt] if isinstance(prompt, str) else prompt
332
+ batch_size = len(prompt)
333
+
334
+ if isinstance(self, TextualInversionLoaderMixin):
335
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
336
+
337
+ text_inputs = self.tokenizer(
338
+ prompt,
339
+ padding="max_length",
340
+ max_length=self.tokenizer_max_length,
341
+ truncation=True,
342
+ return_overflowing_tokens=False,
343
+ return_length=False,
344
+ return_tensors="pt",
345
+ )
346
+
347
+ text_input_ids = text_inputs.input_ids
348
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
349
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
350
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
351
+ logger.warning(
352
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
353
+ f" {self.tokenizer_max_length} tokens: {removed_text}"
354
+ )
355
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
356
+
357
+ # Use pooled output of CLIPTextModel
358
+ prompt_embeds = prompt_embeds.pooler_output
359
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
360
+
361
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
362
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
363
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
364
+
365
+ return prompt_embeds
366
+
367
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
368
+ def encode_prompt(
369
+ self,
370
+ prompt: Union[str, List[str]],
371
+ prompt_2: Union[str, List[str]],
372
+ device: Optional[torch.device] = None,
373
+ num_images_per_prompt: int = 1,
374
+ prompt_embeds: Optional[torch.FloatTensor] = None,
375
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
376
+ max_sequence_length: int = 512,
377
+ lora_scale: Optional[float] = None,
378
+ ):
379
+ r"""
380
+
381
+ Args:
382
+ prompt (`str` or `List[str]`, *optional*):
383
+ prompt to be encoded
384
+ prompt_2 (`str` or `List[str]`, *optional*):
385
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
386
+ used in all text-encoders
387
+ device: (`torch.device`):
388
+ torch device
389
+ num_images_per_prompt (`int`):
390
+ number of images that should be generated per prompt
391
+ prompt_embeds (`torch.FloatTensor`, *optional*):
392
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
393
+ provided, text embeddings will be generated from `prompt` input argument.
394
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
395
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
396
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
397
+ lora_scale (`float`, *optional*):
398
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
399
+ """
400
+ device = device or self._execution_device
401
+
402
+ # set lora scale so that monkey patched LoRA
403
+ # function of text encoder can correctly access it
404
+ if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
405
+ self._lora_scale = lora_scale
406
+
407
+ # dynamically adjust the LoRA scale
408
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
409
+ scale_lora_layers(self.text_encoder, lora_scale)
410
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
411
+ scale_lora_layers(self.text_encoder_2, lora_scale)
412
+
413
+ prompt = [prompt] if isinstance(prompt, str) else prompt
414
+
415
+ if prompt_embeds is None:
416
+ prompt_2 = prompt_2 or prompt
417
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
418
+
419
+ # We only use the pooled prompt output from the CLIPTextModel
420
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
421
+ prompt=prompt,
422
+ device=device,
423
+ num_images_per_prompt=num_images_per_prompt,
424
+ )
425
+ prompt_embeds = self._get_t5_prompt_embeds(
426
+ prompt=prompt_2,
427
+ num_images_per_prompt=num_images_per_prompt,
428
+ max_sequence_length=max_sequence_length,
429
+ device=device,
430
+ )
431
+
432
+ if self.text_encoder is not None:
433
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
434
+ # Retrieve the original scale by scaling back the LoRA layers
435
+ unscale_lora_layers(self.text_encoder, lora_scale)
436
+
437
+ if self.text_encoder_2 is not None:
438
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
439
+ # Retrieve the original scale by scaling back the LoRA layers
440
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
441
+
442
+ dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
443
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
444
+
445
+ return prompt_embeds, pooled_prompt_embeds, text_ids
446
+
447
+ # Adapted from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.check_inputs
448
+ def check_inputs(
449
+ self,
450
+ prompt,
451
+ prompt_2,
452
+ height,
453
+ width,
454
+ prompt_embeds=None,
455
+ pooled_prompt_embeds=None,
456
+ callback_on_step_end_tensor_inputs=None,
457
+ max_sequence_length=None,
458
+ ):
459
+ if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
460
+ raise ValueError(
461
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}."
462
+ )
463
+
464
+ if callback_on_step_end_tensor_inputs is not None and not all(
465
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
466
+ ):
467
+ raise ValueError(
468
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
469
+ )
470
+
471
+ if prompt is not None and prompt_embeds is not None:
472
+ raise ValueError(
473
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
474
+ " only forward one of the two."
475
+ )
476
+ elif prompt_2 is not None and prompt_embeds is not None:
477
+ raise ValueError(
478
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
479
+ " only forward one of the two."
480
+ )
481
+ elif prompt is None and prompt_embeds is None:
482
+ raise ValueError(
483
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
484
+ )
485
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
486
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
487
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
488
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
489
+
490
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
491
+ raise ValueError(
492
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
493
+ )
494
+
495
+ if max_sequence_length is not None and max_sequence_length > 512:
496
+ raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
497
+
498
+ @staticmethod
499
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
500
+ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
501
+ latent_image_ids = torch.zeros(height, width, 3)
502
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
503
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
504
+
505
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
506
+
507
+ latent_image_ids = latent_image_ids.reshape(
508
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
509
+ )
510
+
511
+ return latent_image_ids.to(device=device, dtype=dtype)
512
+
513
+ @staticmethod
514
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
515
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
516
+ latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
517
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
518
+ latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
519
+
520
+ return latents
521
+
522
+ @staticmethod
523
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
524
+ def _unpack_latents(latents, height, width, vae_scale_factor):
525
+ batch_size, num_patches, channels = latents.shape
526
+
527
+ # VAE applies 8x compression on images but we must also account for packing which requires
528
+ # latent height and width to be divisible by 2.
529
+ height = 2 * (int(height) // (vae_scale_factor * 2))
530
+ width = 2 * (int(width) // (vae_scale_factor * 2))
531
+
532
+ latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
533
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
534
+
535
+ latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
536
+
537
+ return latents
538
+
539
+ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
540
+ if isinstance(generator, list):
541
+ image_latents = [
542
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
543
+ for i in range(image.shape[0])
544
+ ]
545
+ image_latents = torch.cat(image_latents, dim=0)
546
+ else:
547
+ image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
548
+
549
+ image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
550
+
551
+ return image_latents
552
+
553
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.enable_vae_slicing
554
+ def enable_vae_slicing(self):
555
+ r"""
556
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
557
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
558
+ """
559
+ self.vae.enable_slicing()
560
+
561
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_slicing
562
+ def disable_vae_slicing(self):
563
+ r"""
564
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
565
+ computing decoding in one step.
566
+ """
567
+ self.vae.disable_slicing()
568
+
569
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.enable_vae_tiling
570
+ def enable_vae_tiling(self):
571
+ r"""
572
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
573
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
574
+ processing larger images.
575
+ """
576
+ self.vae.enable_tiling()
577
+
578
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_tiling
579
+ def disable_vae_tiling(self):
580
+ r"""
581
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
582
+ computing decoding in one step.
583
+ """
584
+ self.vae.disable_tiling()
585
+
586
+ def prepare_latents(
587
+ self,
588
+ batch_size,
589
+ num_channels_latents,
590
+ height,
591
+ width,
592
+ dtype,
593
+ device,
594
+ generator,
595
+ image,
596
+ subject_images,
597
+ spatial_images,
598
+ latents=None,
599
+ cond_size=512,
600
+ ):
601
+ height = 2 * (int(height) // (self.vae_scale_factor * 2))
602
+ width = 2 * (int(width) // (self.vae_scale_factor * 2))
603
+ height_cond = 2 * (cond_size // (self.vae_scale_factor * 2))
604
+ width_cond = 2 * (cond_size // (self.vae_scale_factor * 2))
605
+
606
+ image_latents = image_ids = None
607
+ # Prepare noise latents
608
+ shape = (batch_size, num_channels_latents, height, width)
609
+ if latents is None:
610
+ noise_latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
611
+ else:
612
+ noise_latents = latents.to(device=device, dtype=dtype)
613
+
614
+ noise_latents = self._pack_latents(noise_latents, batch_size, num_channels_latents, height, width)
615
+ # print(noise_latents.shape)
616
+ noise_latent_image_ids, cond_latent_image_ids_resized = resize_position_encoding(
617
+ batch_size, height, width, height_cond, width_cond, device, dtype
618
+ )
619
+ # noise IDs are marked with 0 in the first channel
620
+ noise_latent_image_ids[..., 0] = 0
621
+
622
+ cond_latents_to_concat = []
623
+ latents_ids_to_concat = [noise_latent_image_ids]
624
+
625
+ # 1. Prepare `image` (Kontext) latents
626
+ if image is not None:
627
+ image = image.to(device=device, dtype=dtype)
628
+ if image.shape[1] != self.latent_channels:
629
+ image_latents = self._encode_vae_image(image=image, generator=generator)
630
+ else:
631
+ image_latents = image
632
+
633
+ image_latent_h, image_latent_w = image_latents.shape[2:]
634
+ image_latents = self._pack_latents(
635
+ image_latents, batch_size, num_channels_latents, image_latent_h, image_latent_w
636
+ )
637
+ image_ids = self._prepare_latent_image_ids(
638
+ batch_size, image_latent_h // 2, image_latent_w // 2, device, dtype
639
+ )
640
+ image_ids[..., 0] = 1 # Mark as condition
641
+ latents_ids_to_concat.append(image_ids)
642
+
643
+ # 2. Prepare `subject_images` latents
644
+ if subject_images is not None and len(subject_images) > 0:
645
+ subject_images = subject_images.to(device=device, dtype=dtype)
646
+ subject_image_latents = self._encode_vae_image(image=subject_images, generator=generator)
647
+ subject_latents = self._pack_latents(
648
+ subject_image_latents, batch_size, num_channels_latents, height_cond * len(subject_images), width_cond
649
+ )
650
+
651
+ latent_subject_ids = prepare_latent_subject_ids(height_cond // 2, width_cond // 2, device, dtype)
652
+ latent_subject_ids[..., 0] = 1
653
+ latent_subject_ids[:, 1] += image_latent_h // 2
654
+ subject_latent_image_ids = torch.cat([latent_subject_ids for _ in range(len(subject_images))], dim=0)
655
+
656
+ cond_latents_to_concat.append(subject_latents)
657
+ latents_ids_to_concat.append(subject_latent_image_ids)
658
+
659
+ # 3. Prepare `spatial_images` latents
660
+ if spatial_images is not None and len(spatial_images) > 0:
661
+ spatial_images = spatial_images.to(device=device, dtype=dtype)
662
+ spatial_image_latents = self._encode_vae_image(image=spatial_images, generator=generator)
663
+ cond_latents = self._pack_latents(
664
+ spatial_image_latents, batch_size, num_channels_latents, height_cond * len(spatial_images), width_cond
665
+ )
666
+ cond_latent_image_ids_resized[..., 0] = 2
667
+ cond_latent_image_ids = torch.cat(
668
+ [cond_latent_image_ids_resized for _ in range(len(spatial_images))], dim=0
669
+ )
670
+
671
+ cond_latents_to_concat.append(cond_latents)
672
+ latents_ids_to_concat.append(cond_latent_image_ids)
673
+
674
+ cond_latents = torch.cat(cond_latents_to_concat, dim=1) if cond_latents_to_concat else None
675
+ latent_image_ids = torch.cat(latents_ids_to_concat, dim=0)
676
+
677
+ return noise_latents, image_latents, cond_latents, latent_image_ids
678
+
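`prepare_latents` keeps the streams distinguishable through the first channel of their positional IDs: 0 for the noise latents, 1 for the Kontext image and subject latents, and 2 for the resized spatial-condition latents. The toy sketch below illustrates only that tagging scheme; it does not reproduce the library helpers (`resize_position_encoding`, `prepare_latent_subject_ids`), and all shapes are illustrative:

import torch

def make_ids(h, w, stream_type):
    # Build (type, row, col) IDs for an h x w latent grid, flattened to tokens.
    ids = torch.zeros(h, w, 3)
    ids[..., 0] = stream_type                # 0 = noise, 1 = image/subject, 2 = spatial condition
    ids[..., 1] = torch.arange(h)[:, None]   # row position
    ids[..., 2] = torch.arange(w)[None, :]   # column position
    return ids.reshape(h * w, 3)

noise_ids   = make_ids(64, 64, stream_type=0)
image_ids   = make_ids(64, 64, stream_type=1)
spatial_ids = make_ids(32, 32, stream_type=2)
latent_image_ids = torch.cat([noise_ids, image_ids, spatial_ids], dim=0)
print(latent_image_ids.shape)                # torch.Size([9216, 3])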
679
+ @property
680
+ def guidance_scale(self):
681
+ return self._guidance_scale
682
+
683
+ @property
684
+ def joint_attention_kwargs(self):
685
+ return self._joint_attention_kwargs
686
+
687
+ @property
688
+ def num_timesteps(self):
689
+ return self._num_timesteps
690
+
691
+ @property
692
+ def current_timestep(self):
693
+ return self._current_timestep
694
+
695
+ @property
696
+ def interrupt(self):
697
+ return self._interrupt
698
+
699
+ @torch.no_grad()
700
+ def __call__(
701
+ self,
702
+ image: Optional[PipelineImageInput] = None,
703
+ prompt: Union[str, List[str]] = None,
704
+ prompt_2: Optional[Union[str, List[str]]] = None,
705
+ height: Optional[int] = None,
706
+ width: Optional[int] = None,
707
+ num_inference_steps: int = 28,
708
+ sigmas: Optional[List[float]] = None,
709
+ guidance_scale: float = 3.5,
710
+ num_images_per_prompt: Optional[int] = 1,
711
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
712
+ latents: Optional[torch.FloatTensor] = None,
713
+ prompt_embeds: Optional[torch.FloatTensor] = None,
714
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
715
+ output_type: Optional[str] = "pil",
716
+ return_dict: bool = True,
717
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
718
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
719
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
720
+ max_sequence_length: int = 512,
721
+ cond_size: int = 512,
722
+ control_dict: Optional[Dict[str, Any]] = None,
723
+ ):
724
+ r"""
725
+ Function invoked when calling the pipeline for generation.
726
+
727
+ Args:
728
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
729
+ `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
730
+ numpy arrays and pytorch tensors, the expected value range is `[0, 1]`. If it's a tensor or a list
731
+ of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
732
+ list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image
733
+ latents as `image`; latents passed directly are not encoded again.
734
+ prompt (`str` or `List[str]`, *optional*):
735
+ The prompt or prompts to guide the image generation. If not defined, `prompt_embeds` must be passed
736
+ instead.
737
+ prompt_2 (`str` or `List[str]`, *optional*):
738
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
739
+ will be used instead.
740
+ height (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
741
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
742
+ width (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
743
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
744
+ num_inference_steps (`int`, *optional*, defaults to 28):
745
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
746
+ expense of slower inference.
747
+ sigmas (`List[float]`, *optional*):
748
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
749
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
750
+ will be used.
751
+ guidance_scale (`float`, *optional*, defaults to 3.5):
752
+ Guidance scale as defined in [Classifier-Free Diffusion
753
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
754
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
755
+ `guidance_scale > 1`. A higher guidance scale encourages the model to generate images closely linked to
756
+ the text `prompt`, usually at the expense of lower image quality.
757
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
758
+ The number of images to generate per prompt.
759
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
760
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
761
+ to make generation deterministic.
762
+ latents (`torch.FloatTensor`, *optional*):
763
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
764
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
765
+ tensor will be generated by sampling using the supplied random `generator`.
766
+ prompt_embeds (`torch.FloatTensor`, *optional*):
767
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
768
+ provided, text embeddings will be generated from `prompt` input argument.
769
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
770
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
771
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
772
+ output_type (`str`, *optional*, defaults to `"pil"`):
773
+ The output format of the generated image. Choose between
774
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
775
+ return_dict (`bool`, *optional*, defaults to `True`):
776
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
777
+ joint_attention_kwargs (`dict`, *optional*):
778
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
779
+ `self.processor` in
780
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
781
+ callback_on_step_end (`Callable`, *optional*):
782
+ A function that is called at the end of each denoising step during inference. The function is called
783
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
784
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
785
+ `callback_on_step_end_tensor_inputs`.
786
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
787
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
788
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
789
+ `._callback_tensor_inputs` attribute of your pipeline class.
790
+ max_sequence_length (`int`, *optional*, defaults to 512):
791
+ Maximum sequence length to use with the `prompt`.
792
+ cond_size (`int`, *optional*, defaults to 512):
793
+ The size for conditioning images.
794
+
795
+ Examples:
796
+
797
+ Returns:
798
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
799
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
800
+ images.
801
+ """
802
+
803
+ height = height or self.default_sample_size * self.vae_scale_factor
804
+ width = width or self.default_sample_size * self.vae_scale_factor
805
+
806
+ # 1. Check inputs. Raise error if not correct
807
+ self.check_inputs(
808
+ prompt,
809
+ prompt_2,
810
+ height,
811
+ width,
812
+ prompt_embeds=prompt_embeds,
813
+ pooled_prompt_embeds=pooled_prompt_embeds,
814
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
815
+ max_sequence_length=max_sequence_length,
816
+ )
817
+
818
+ self._guidance_scale = guidance_scale
819
+ self._joint_attention_kwargs = joint_attention_kwargs
820
+ self._current_timestep = None
821
+ self._interrupt = False
822
+
823
+ spatial_images = control_dict.get("spatial_images", [])
824
+ subject_images = control_dict.get("subject_images", [])
825
+
826
+ # 2. Define call parameters
827
+ if prompt is not None and isinstance(prompt, str):
828
+ batch_size = 1
829
+ elif prompt is not None and isinstance(prompt, list):
830
+ batch_size = len(prompt)
831
+ else:
832
+ batch_size = prompt_embeds.shape[0]
833
+
834
+ device = self._execution_device
835
+
836
+ lora_scale = (
837
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
838
+ )
839
+ (
840
+ prompt_embeds,
841
+ pooled_prompt_embeds,
842
+ text_ids,
843
+ ) = self.encode_prompt(
844
+ prompt=prompt,
845
+ prompt_2=prompt_2,
846
+ prompt_embeds=prompt_embeds,
847
+ pooled_prompt_embeds=pooled_prompt_embeds,
848
+ device=device,
849
+ num_images_per_prompt=num_images_per_prompt,
850
+ max_sequence_length=max_sequence_length,
851
+ lora_scale=lora_scale,
852
+ )
853
+
854
+ # 3. Preprocess images
855
+ if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
856
+ img = image[0] if isinstance(image, list) else image
857
+ image_height, image_width = self.image_processor.get_default_height_width(img)
858
+ aspect_ratio = image_width / image_height
859
+ # Kontext is trained on specific resolutions, using one of them is recommended
860
+ _, image_width, image_height = min(
861
+ (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
862
+ )
863
+ multiple_of = self.vae_scale_factor * 2
864
+ image_width = image_width // multiple_of * multiple_of
865
+ image_height = image_height // multiple_of * multiple_of
866
+ image = self.image_processor.resize(image, image_height, image_width)
867
+ image = self.image_processor.preprocess(image, image_height, image_width)
868
+ image = image.to(dtype=self.vae.dtype)
869
+
870
+ if len(subject_images) > 0:
871
+ subject_image_ls = []
872
+ for subject_image in subject_images:
873
+ w, h = subject_image.size[:2]
874
+ scale = cond_size / max(h, w)
875
+ new_h, new_w = int(h * scale), int(w * scale)
876
+ subject_image = self.image_processor.preprocess(subject_image, height=new_h, width=new_w)
877
+ subject_image = subject_image.to(dtype=self.vae.dtype)
878
+ pad_h = cond_size - subject_image.shape[-2]
879
+ pad_w = cond_size - subject_image.shape[-1]
880
+ subject_image = pad(
881
+ subject_image, padding=(int(pad_w / 2), int(pad_h / 2), int(pad_w / 2), int(pad_h / 2)), fill=0
882
+ )
883
+ subject_image_ls.append(subject_image)
884
+ subject_images = torch.cat(subject_image_ls, dim=-2)
885
+ else:
886
+ subject_images = None
887
+
888
+ if len(spatial_images) > 0:
889
+ condition_image_ls = []
890
+ for img in spatial_images:
891
+ condition_image = self.image_processor.preprocess(img, height=cond_size, width=cond_size)
892
+ condition_image = condition_image.to(dtype=self.vae.dtype)
893
+ condition_image_ls.append(condition_image)
894
+ spatial_images = torch.cat(condition_image_ls, dim=-2)
895
+ else:
896
+ spatial_images = None
897
+
898
+ # 4. Prepare latent variables
899
+ num_channels_latents = self.transformer.config.in_channels // 4
900
+ latents, image_latents, cond_latents, latent_image_ids = self.prepare_latents(
901
+ batch_size * num_images_per_prompt,
902
+ num_channels_latents,
903
+ height,
904
+ width,
905
+ prompt_embeds.dtype,
906
+ device,
907
+ generator,
908
+ image,
909
+ subject_images,
910
+ spatial_images,
911
+ latents,
912
+ cond_size,
913
+ )
914
+
915
+ # 5. Prepare timesteps
916
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
917
+ image_seq_len = latents.shape[1]
918
+ mu = calculate_shift(
919
+ image_seq_len,
920
+ self.scheduler.config.get("base_image_seq_len", 256),
921
+ self.scheduler.config.get("max_image_seq_len", 4096),
922
+ self.scheduler.config.get("base_shift", 0.5),
923
+ self.scheduler.config.get("max_shift", 1.15),
924
+ )
925
+ timesteps, num_inference_steps = retrieve_timesteps(
926
+ self.scheduler,
927
+ num_inference_steps,
928
+ device,
929
+ sigmas=sigmas,
930
+ mu=mu,
931
+ )
932
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
933
+ self._num_timesteps = len(timesteps)
934
+
935
+ # handle guidance
936
+ if self.transformer.config.guidance_embeds:
937
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
938
+ guidance = guidance.expand(latents.shape[0])
939
+ else:
940
+ guidance = None
941
+
942
+ # 6. Denoising loop
943
+ self.scheduler.set_begin_index(0)
944
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
945
+ for i, t in enumerate(timesteps):
946
+ if self.interrupt:
947
+ continue
948
+
949
+ latent_model_input = latents
950
+ if image_latents is not None:
951
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=1)
952
+
953
+ self._current_timestep = t
954
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
955
+ noise_pred = self.transformer(
956
+ hidden_states=latent_model_input,
957
+ cond_hidden_states=cond_latents,
958
+ timestep=timestep / 1000,
959
+ guidance=guidance,
960
+ pooled_projections=pooled_prompt_embeds,
961
+ encoder_hidden_states=prompt_embeds,
962
+ txt_ids=text_ids,
963
+ img_ids=latent_image_ids,
964
+ joint_attention_kwargs=self.joint_attention_kwargs,
965
+ return_dict=False,
966
+ )[0]
967
+
968
+ noise_pred = noise_pred[:, : latents.size(1)]
969
+
970
+ # compute the previous noisy sample x_t -> x_t-1
971
+ latents_dtype = latents.dtype
972
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
973
+
974
+ if latents.dtype != latents_dtype:
975
+ latents = latents.to(latents_dtype)
976
+
977
+ if callback_on_step_end is not None:
978
+ callback_kwargs = {}
979
+ for k in callback_on_step_end_tensor_inputs:
980
+ callback_kwargs[k] = locals()[k]
981
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
982
+
983
+ latents = callback_outputs.pop("latents", latents)
984
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
985
+
986
+ # call the callback, if provided
987
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
988
+ progress_bar.update()
989
+
990
+ if XLA_AVAILABLE:
991
+ xm.mark_step()
992
+
993
+ self._current_timestep = None
994
+
995
+ if output_type == "latent":
996
+ image = latents
997
+ else:
998
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
999
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
1000
+ image = self.vae.decode(latents, return_dict=False)[0]
1001
+ image = self.image_processor.postprocess(image, output_type=output_type)
1002
+
1003
+ # Offload all models
1004
+ self.maybe_free_model_hooks()
1005
+
1006
+ if not return_dict:
1007
+ return (image,)
1008
+
1009
+ return FluxPipelineOutput(images=image)
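For reference, a minimal invocation sketch of the call signature above. The class name matches the import used by the training scripts (`FluxKontextControlPipeline`); the checkpoint id, the `from_pretrained` loading path, and the input file names are assumptions for illustration, not taken from this commit:

import torch
from PIL import Image
from src.pipeline_flux_kontext_control import FluxKontextControlPipeline

# Assumed base checkpoint and loading path; LoRA/control weights would be attached separately.
pipe = FluxKontextControlPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16
).to("cuda")

source = Image.open("input.png").convert("RGB")       # image to edit
edge_map = Image.open("edge.png").convert("RGB")      # hypothetical spatial condition (e.g. an edge map)

result = pipe(
    image=source,
    prompt="add a red scarf to the cat",
    num_inference_steps=28,
    guidance_scale=3.5,
    cond_size=512,
    control_dict={"spatial_images": [edge_map], "subject_images": []},
).images[0]
result.save("output.png")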
train/src/prompt_helper.py ADDED
@@ -0,0 +1,205 @@
1
+ import torch
2
+
3
+
4
+ def load_text_encoders(args, class_one, class_two):
5
+ text_encoder_one = class_one.from_pretrained(
6
+ args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
7
+ )
8
+ text_encoder_two = class_two.from_pretrained(
9
+ args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
10
+ )
11
+ return text_encoder_one, text_encoder_two
12
+
13
+
14
+ def tokenize_prompt(tokenizer, prompt, max_sequence_length):
15
+ text_inputs = tokenizer(
16
+ prompt,
17
+ padding="max_length",
18
+ max_length=max_sequence_length,
19
+ truncation=True,
20
+ return_length=False,
21
+ return_overflowing_tokens=False,
22
+ return_tensors="pt",
23
+ )
24
+ text_input_ids = text_inputs.input_ids
25
+ return text_input_ids
26
+
27
+
28
+ def tokenize_prompt_clip(tokenizer, prompt):
29
+ text_inputs = tokenizer(
30
+ prompt,
31
+ padding="max_length",
32
+ max_length=77,
33
+ truncation=True,
34
+ return_length=False,
35
+ return_overflowing_tokens=False,
36
+ return_tensors="pt",
37
+ )
38
+ text_input_ids = text_inputs.input_ids
39
+ return text_input_ids
40
+
41
+
42
+ def tokenize_prompt_t5(tokenizer, prompt):
43
+ text_inputs = tokenizer(
44
+ prompt,
45
+ padding="max_length",
46
+ max_length=512,
47
+ truncation=True,
48
+ return_length=False,
49
+ return_overflowing_tokens=False,
50
+ return_tensors="pt",
51
+ )
52
+ text_input_ids = text_inputs.input_ids
53
+ return text_input_ids
54
+
55
+
56
+ def _encode_prompt_with_t5(
57
+ text_encoder,
58
+ tokenizer,
59
+ max_sequence_length=512,
60
+ prompt=None,
61
+ num_images_per_prompt=1,
62
+ device=None,
63
+ text_input_ids=None,
64
+ ):
65
+ prompt = [prompt] if isinstance(prompt, str) else prompt
66
+ batch_size = len(prompt)
67
+
68
+ if tokenizer is not None:
69
+ text_inputs = tokenizer(
70
+ prompt,
71
+ padding="max_length",
72
+ max_length=max_sequence_length,
73
+ truncation=True,
74
+ return_length=False,
75
+ return_overflowing_tokens=False,
76
+ return_tensors="pt",
77
+ )
78
+ text_input_ids = text_inputs.input_ids
79
+ else:
80
+ if text_input_ids is None:
81
+ raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
82
+
83
+ prompt_embeds = text_encoder(text_input_ids.to(device))[0]
84
+
85
+ dtype = text_encoder.dtype
86
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
87
+
88
+ _, seq_len, _ = prompt_embeds.shape
89
+
90
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
91
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
92
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
93
+
94
+ return prompt_embeds
95
+
96
+
97
+ def _encode_prompt_with_clip(
98
+ text_encoder,
99
+ tokenizer,
100
+ prompt: str,
101
+ device=None,
102
+ text_input_ids=None,
103
+ num_images_per_prompt: int = 1,
104
+ ):
105
+ prompt = [prompt] if isinstance(prompt, str) else prompt
106
+ batch_size = len(prompt)
107
+
108
+ if tokenizer is not None:
109
+ text_inputs = tokenizer(
110
+ prompt,
111
+ padding="max_length",
112
+ max_length=77,
113
+ truncation=True,
114
+ return_overflowing_tokens=False,
115
+ return_length=False,
116
+ return_tensors="pt",
117
+ )
118
+
119
+ text_input_ids = text_inputs.input_ids
120
+ else:
121
+ if text_input_ids is None:
122
+ raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
123
+
124
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
125
+
126
+ # Use pooled output of CLIPTextModel
127
+ prompt_embeds = prompt_embeds.pooler_output
128
+ prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
129
+
130
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
131
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
132
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
133
+
134
+ return prompt_embeds
135
+
136
+
137
+ def encode_prompt(
138
+ text_encoders,
139
+ tokenizers,
140
+ prompt: str,
141
+ max_sequence_length,
142
+ device=None,
143
+ num_images_per_prompt: int = 1,
144
+ text_input_ids_list=None,
145
+ ):
146
+ prompt = [prompt] if isinstance(prompt, str) else prompt
147
+ dtype = text_encoders[0].dtype
148
+
149
+ pooled_prompt_embeds = _encode_prompt_with_clip(
150
+ text_encoder=text_encoders[0],
151
+ tokenizer=tokenizers[0],
152
+ prompt=prompt,
153
+ device=device if device is not None else text_encoders[0].device,
154
+ num_images_per_prompt=num_images_per_prompt,
155
+ text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
156
+ )
157
+
158
+ prompt_embeds = _encode_prompt_with_t5(
159
+ text_encoder=text_encoders[1],
160
+ tokenizer=tokenizers[1],
161
+ max_sequence_length=max_sequence_length,
162
+ prompt=prompt,
163
+ num_images_per_prompt=num_images_per_prompt,
164
+ device=device if device is not None else text_encoders[1].device,
165
+ text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
166
+ )
167
+
168
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
169
+
170
+ return prompt_embeds, pooled_prompt_embeds, text_ids
171
+
172
+
173
+ def encode_token_ids(text_encoders, tokens, accelerator, num_images_per_prompt=1, device=None):
174
+ text_encoder_clip = text_encoders[0]
175
+ text_encoder_t5 = text_encoders[1]
176
+ tokens_clip, tokens_t5 = tokens[0], tokens[1]
177
+ batch_size = tokens_clip.shape[0]
178
+
179
+ # Keep "cpu" when explicitly requested; otherwise use the accelerator device
180
+ device = accelerator.device if device != "cpu" else "cpu"
183
+
184
+ # clip
185
+ prompt_embeds = text_encoder_clip(tokens_clip.to(device), output_hidden_states=False)
186
+ # Use pooled output of CLIPTextModel
187
+ prompt_embeds = prompt_embeds.pooler_output
188
+ prompt_embeds = prompt_embeds.to(dtype=text_encoder_clip.dtype, device=accelerator.device)
189
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
190
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
191
+ pooled_prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
192
+ pooled_prompt_embeds = pooled_prompt_embeds.to(dtype=text_encoder_clip.dtype, device=accelerator.device)
193
+
194
+ # t5
195
+ prompt_embeds = text_encoder_t5(tokens_t5.to(device))[0]
196
+ dtype = text_encoder_t5.dtype
197
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=accelerator.device)
198
+ _, seq_len, _ = prompt_embeds.shape
199
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
200
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
201
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
202
+
203
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=accelerator.device, dtype=dtype)
204
+
205
+ return prompt_embeds, pooled_prompt_embeds, text_ids
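These helpers mirror Flux's dual text-encoder setup: CLIP supplies the pooled embedding, T5 the token-level sequence embedding. A hedged wiring sketch follows (the base checkpoint id and subfolder layout are assumptions, and the import path assumes the script is run from `train/` as in the training entry points):

import torch
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
from src.prompt_helper import encode_prompt

base = "black-forest-labs/FLUX.1-Kontext-dev"   # assumed checkpoint with the usual Flux subfolders
tok_clip = CLIPTokenizer.from_pretrained(base, subfolder="tokenizer")
tok_t5 = T5TokenizerFast.from_pretrained(base, subfolder="tokenizer_2")
enc_clip = CLIPTextModel.from_pretrained(base, subfolder="text_encoder")
enc_t5 = T5EncoderModel.from_pretrained(base, subfolder="text_encoder_2")

prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
    text_encoders=[enc_clip, enc_t5],
    tokenizers=[tok_clip, tok_t5],
    prompt="a watercolor fox",
    max_sequence_length=512,
    device="cpu",
)
# prompt_embeds: [1, 512, hidden dim], pooled_prompt_embeds: [1, 768], text_ids: [512, 3]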
train/src/transformer_flux.py ADDED
@@ -0,0 +1,625 @@
1
+ from typing import Any, Dict, Optional, Tuple, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
10
+ from diffusers.models.attention import FeedForward
11
+ from diffusers.models.attention_processor import (
12
+ Attention,
13
+ AttentionProcessor,
14
+ FluxAttnProcessor2_0,
15
+ FluxAttnProcessor2_0_NPU,
16
+ FusedFluxAttnProcessor2_0,
17
+ )
18
+ from diffusers.models.modeling_utils import ModelMixin
19
+ from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
20
+ from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
21
+ from diffusers.utils.import_utils import is_torch_npu_available
22
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
23
+ from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
24
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
25
+
26
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
27
+
28
+ @maybe_allow_in_graph
29
+ class FluxSingleTransformerBlock(nn.Module):
30
+
31
+ def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
32
+ super().__init__()
33
+ self.mlp_hidden_dim = int(dim * mlp_ratio)
34
+
35
+ self.norm = AdaLayerNormZeroSingle(dim)
36
+ self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
37
+ self.act_mlp = nn.GELU(approximate="tanh")
38
+ self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
39
+
40
+ if is_torch_npu_available():
41
+ processor = FluxAttnProcessor2_0_NPU()
42
+ else:
43
+ processor = FluxAttnProcessor2_0()
44
+ self.attn = Attention(
45
+ query_dim=dim,
46
+ cross_attention_dim=None,
47
+ dim_head=attention_head_dim,
48
+ heads=num_attention_heads,
49
+ out_dim=dim,
50
+ bias=True,
51
+ processor=processor,
52
+ qk_norm="rms_norm",
53
+ eps=1e-6,
54
+ pre_only=True,
55
+ )
56
+
57
+ def forward(
58
+ self,
59
+ hidden_states: torch.Tensor,
60
+ cond_hidden_states: torch.Tensor,
61
+ temb: torch.Tensor,
62
+ cond_temb: torch.Tensor,
63
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
64
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
65
+ ) -> torch.Tensor:
66
+ use_cond = cond_hidden_states is not None
67
+
68
+ residual = hidden_states
69
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
70
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
71
+
72
+ if use_cond:
73
+ residual_cond = cond_hidden_states
74
+ norm_cond_hidden_states, cond_gate = self.norm(cond_hidden_states, emb=cond_temb)
75
+ mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_cond_hidden_states))
76
+
77
+ norm_hidden_states_concat = torch.concat([norm_hidden_states, norm_cond_hidden_states], dim=-2)
78
+
79
+ joint_attention_kwargs = joint_attention_kwargs or {}
80
+ attn_output = self.attn(
81
+ hidden_states=norm_hidden_states_concat,
82
+ image_rotary_emb=image_rotary_emb,
83
+ use_cond=use_cond,
84
+ **joint_attention_kwargs,
85
+ )
86
+ if use_cond:
87
+ attn_output, cond_attn_output = attn_output
88
+
89
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
90
+ gate = gate.unsqueeze(1)
91
+ hidden_states = gate * self.proj_out(hidden_states)
92
+ hidden_states = residual + hidden_states
93
+
94
+ if use_cond:
95
+ condition_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2)
96
+ cond_gate = cond_gate.unsqueeze(1)
97
+ condition_latents = cond_gate * self.proj_out(condition_latents)
98
+ condition_latents = residual_cond + condition_latents
99
+
100
+ if hidden_states.dtype == torch.float16:
101
+ hidden_states = hidden_states.clip(-65504, 65504)
102
+
103
+ return hidden_states, condition_latents if use_cond else None
104
+
105
+
106
+ @maybe_allow_in_graph
107
+ class FluxTransformerBlock(nn.Module):
108
+ def __init__(
109
+ self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
110
+ ):
111
+ super().__init__()
112
+
113
+ self.norm1 = AdaLayerNormZero(dim)
114
+
115
+ self.norm1_context = AdaLayerNormZero(dim)
116
+
117
+ if hasattr(F, "scaled_dot_product_attention"):
118
+ processor = FluxAttnProcessor2_0()
119
+ else:
120
+ raise ValueError(
121
+ "The current PyTorch version does not support the `scaled_dot_product_attention` function."
122
+ )
123
+ self.attn = Attention(
124
+ query_dim=dim,
125
+ cross_attention_dim=None,
126
+ added_kv_proj_dim=dim,
127
+ dim_head=attention_head_dim,
128
+ heads=num_attention_heads,
129
+ out_dim=dim,
130
+ context_pre_only=False,
131
+ bias=True,
132
+ processor=processor,
133
+ qk_norm=qk_norm,
134
+ eps=eps,
135
+ )
136
+
137
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
138
+ self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
139
+
140
+ self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
141
+ self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
142
+
143
+ # let chunk size default to None
144
+ self._chunk_size = None
145
+ self._chunk_dim = 0
146
+
147
+ def forward(
148
+ self,
149
+ hidden_states: torch.Tensor,
150
+ cond_hidden_states: torch.Tensor,
151
+ encoder_hidden_states: torch.Tensor,
152
+ temb: torch.Tensor,
153
+ cond_temb: torch.Tensor,
154
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
155
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
156
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
157
+ use_cond = cond_hidden_states is not None
158
+
159
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
160
+ if use_cond:
161
+ (
162
+ norm_cond_hidden_states,
163
+ cond_gate_msa,
164
+ cond_shift_mlp,
165
+ cond_scale_mlp,
166
+ cond_gate_mlp,
167
+ ) = self.norm1(cond_hidden_states, emb=cond_temb)
168
+
169
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
170
+ encoder_hidden_states, emb=temb
171
+ )
172
+
173
+ norm_hidden_states = torch.concat([norm_hidden_states, norm_cond_hidden_states], dim=-2)
174
+
175
+ joint_attention_kwargs = joint_attention_kwargs or {}
176
+ # Attention.
177
+ attention_outputs = self.attn(
178
+ hidden_states=norm_hidden_states,
179
+ encoder_hidden_states=norm_encoder_hidden_states,
180
+ image_rotary_emb=image_rotary_emb,
181
+ use_cond=use_cond,
182
+ **joint_attention_kwargs,
183
+ )
184
+
185
+ attn_output, context_attn_output = attention_outputs[:2]
186
+ cond_attn_output = attention_outputs[2] if use_cond else None
187
+
188
+ # Process attention outputs for the `hidden_states`.
189
+ attn_output = gate_msa.unsqueeze(1) * attn_output
190
+ hidden_states = hidden_states + attn_output
191
+
192
+ if use_cond:
193
+ cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output
194
+ cond_hidden_states = cond_hidden_states + cond_attn_output
195
+
196
+ norm_hidden_states = self.norm2(hidden_states)
197
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
198
+
199
+ if use_cond:
200
+ norm_cond_hidden_states = self.norm2(cond_hidden_states)
201
+ norm_cond_hidden_states = (
202
+ norm_cond_hidden_states * (1 + cond_scale_mlp[:, None])
203
+ + cond_shift_mlp[:, None]
204
+ )
205
+
206
+ ff_output = self.ff(norm_hidden_states)
207
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
208
+ hidden_states = hidden_states + ff_output
209
+
210
+ if use_cond:
211
+ cond_ff_output = self.ff(norm_cond_hidden_states)
212
+ cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output
213
+ cond_hidden_states = cond_hidden_states + cond_ff_output
214
+
215
+ # Process attention outputs for the `encoder_hidden_states`.
216
+
217
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
218
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
219
+
220
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
221
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
222
+
223
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
224
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
225
+ if encoder_hidden_states.dtype == torch.float16:
226
+ encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
227
+
228
+ return encoder_hidden_states, hidden_states, cond_hidden_states if use_cond else None
229
+
230
+
231
+ class FluxTransformer2DModel(
232
+ ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, FluxTransformer2DLoadersMixin
233
+ ):
234
+ _supports_gradient_checkpointing = True
235
+ _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
236
+
237
+ @register_to_config
238
+ def __init__(
239
+ self,
240
+ patch_size: int = 1,
241
+ in_channels: int = 64,
242
+ out_channels: Optional[int] = None,
243
+ num_layers: int = 19,
244
+ num_single_layers: int = 38,
245
+ attention_head_dim: int = 128,
246
+ num_attention_heads: int = 24,
247
+ joint_attention_dim: int = 4096,
248
+ pooled_projection_dim: int = 768,
249
+ guidance_embeds: bool = False,
250
+ axes_dims_rope: Tuple[int] = (16, 56, 56),
251
+ ):
252
+ super().__init__()
253
+ self.out_channels = out_channels or in_channels
254
+ self.inner_dim = num_attention_heads * attention_head_dim
255
+
256
+ self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
257
+
258
+ text_time_guidance_cls = (
259
+ CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
260
+ )
261
+ self.time_text_embed = text_time_guidance_cls(
262
+ embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
263
+ )
264
+
265
+ self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim)
266
+ self.x_embedder = nn.Linear(in_channels, self.inner_dim)
267
+
268
+ self.transformer_blocks = nn.ModuleList(
269
+ [
270
+ FluxTransformerBlock(
271
+ dim=self.inner_dim,
272
+ num_attention_heads=num_attention_heads,
273
+ attention_head_dim=attention_head_dim,
274
+ )
275
+ for _ in range(num_layers)
276
+ ]
277
+ )
278
+
279
+ self.single_transformer_blocks = nn.ModuleList(
280
+ [
281
+ FluxSingleTransformerBlock(
282
+ dim=self.inner_dim,
283
+ num_attention_heads=num_attention_heads,
284
+ attention_head_dim=attention_head_dim,
285
+ )
286
+ for _ in range(num_single_layers)
287
+ ]
288
+ )
289
+
290
+ self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
291
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
292
+
293
+ self.gradient_checkpointing = False
294
+
295
+ @property
296
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
297
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
298
+ r"""
299
+ Returns:
300
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
301
+ indexed by its weight name.
302
+ """
303
+ # set recursively
304
+ processors = {}
305
+
306
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
307
+ if hasattr(module, "get_processor"):
308
+ processors[f"{name}.processor"] = module.get_processor()
309
+
310
+ for sub_name, child in module.named_children():
311
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
312
+
313
+ return processors
314
+
315
+ for name, module in self.named_children():
316
+ fn_recursive_add_processors(name, module, processors)
317
+
318
+ return processors
319
+
320
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
321
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
322
+ r"""
323
+ Sets the attention processor to use to compute attention.
324
+
325
+ Parameters:
326
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
327
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
328
+ for **all** `Attention` layers.
329
+
330
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
331
+ processor. This is strongly recommended when setting trainable attention processors.
332
+
333
+ """
334
+ count = len(self.attn_processors.keys())
335
+
336
+ if isinstance(processor, dict) and len(processor) != count:
337
+ raise ValueError(
338
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
339
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
340
+ )
341
+
342
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
343
+ if hasattr(module, "set_processor"):
344
+ if not isinstance(processor, dict):
345
+ module.set_processor(processor)
346
+ else:
347
+ module.set_processor(processor.pop(f"{name}.processor"))
348
+
349
+ for sub_name, child in module.named_children():
350
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
351
+
352
+ for name, module in self.named_children():
353
+ fn_recursive_attn_processor(name, module, processor)
354
+
355
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
356
+ def fuse_qkv_projections(self):
357
+ """
358
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
359
+ are fused. For cross-attention modules, key and value projection matrices are fused.
360
+
361
+ <Tip warning={true}>
362
+
363
+ This API is 🧪 experimental.
364
+
365
+ </Tip>
366
+ """
367
+ self.original_attn_processors = None
368
+
369
+ for _, attn_processor in self.attn_processors.items():
370
+ if "Added" in str(attn_processor.__class__.__name__):
371
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
372
+
373
+ self.original_attn_processors = self.attn_processors
374
+
375
+ for module in self.modules():
376
+ if isinstance(module, Attention):
377
+ module.fuse_projections(fuse=True)
378
+
379
+ self.set_attn_processor(FusedFluxAttnProcessor2_0())
380
+
381
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
382
+ def unfuse_qkv_projections(self):
383
+ """Disables the fused QKV projection if enabled.
384
+
385
+ <Tip warning={true}>
386
+
387
+ This API is 🧪 experimental.
388
+
389
+ </Tip>
390
+
391
+ """
392
+ if self.original_attn_processors is not None:
393
+ self.set_attn_processor(self.original_attn_processors)
394
+
395
+ def _set_gradient_checkpointing(self, module=None, enable=False, gradient_checkpointing_func=None):
396
+ # Align with diffusers' enable_gradient_checkpointing API which may call
397
+ # without a `module` argument and pass only keyword args.
398
+ # Toggle on both the provided module (if any) and on self for safety.
399
+ if module is not None and hasattr(module, "gradient_checkpointing"):
400
+ module.gradient_checkpointing = enable
401
+ if hasattr(self, "gradient_checkpointing"):
402
+ self.gradient_checkpointing = enable
403
+ # Optionally store the provided function for future use.
404
+ if gradient_checkpointing_func is not None:
405
+ setattr(self, "_gradient_checkpointing_func", gradient_checkpointing_func)
406
+
407
+ def forward(
408
+ self,
409
+ hidden_states: torch.Tensor,
410
+ cond_hidden_states: torch.Tensor = None,
411
+ encoder_hidden_states: torch.Tensor = None,
412
+ pooled_projections: torch.Tensor = None,
413
+ timestep: torch.LongTensor = None,
414
+ img_ids: torch.Tensor = None,
415
+ txt_ids: torch.Tensor = None,
416
+ guidance: torch.Tensor = None,
417
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
418
+ controlnet_block_samples=None,
419
+ controlnet_single_block_samples=None,
420
+ return_dict: bool = True,
421
+ controlnet_blocks_repeat: bool = False,
422
+ ) -> Union[torch.Tensor, Transformer2DModelOutput]:
423
+ if cond_hidden_states is not None:
424
+ use_condition = True
425
+ else:
426
+ use_condition = False
427
+
428
+ if joint_attention_kwargs is not None:
429
+ joint_attention_kwargs = joint_attention_kwargs.copy()
430
+ lora_scale = joint_attention_kwargs.pop("scale", 1.0)
431
+ else:
432
+ lora_scale = 1.0
433
+
434
+ if USE_PEFT_BACKEND:
435
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
436
+ scale_lora_layers(self, lora_scale)
437
+ else:
438
+ if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
439
+ logger.warning(
440
+ "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
441
+ )
442
+
443
+ hidden_states = self.x_embedder(hidden_states)
444
+ if cond_hidden_states is not None:
445
+ if cond_hidden_states.shape[-1] == self.x_embedder.in_features:
446
+ cond_hidden_states = self.x_embedder(cond_hidden_states)
447
+ elif cond_hidden_states.shape[-1] == 64:
448
+ # Use only the first 64 input columns of the weight, plus the bias
449
+ weight = self.x_embedder.weight[:, :64] # [inner_dim, 64]
450
+ bias = self.x_embedder.bias
451
+ cond_hidden_states = torch.nn.functional.linear(cond_hidden_states, weight, bias)
452
+
453
+ timestep = timestep.to(hidden_states.dtype) * 1000
454
+ if guidance is not None:
455
+ guidance = guidance.to(hidden_states.dtype) * 1000
456
+ else:
457
+ guidance = None
458
+
459
+ temb = (
460
+ self.time_text_embed(timestep, pooled_projections)
461
+ if guidance is None
462
+ else self.time_text_embed(timestep, guidance, pooled_projections)
463
+ )
464
+
465
+ cond_temb = (
466
+ self.time_text_embed(torch.ones_like(timestep) * 0, pooled_projections)
467
+ if guidance is None
468
+ else self.time_text_embed(
469
+ torch.ones_like(timestep) * 0, guidance, pooled_projections
470
+ )
471
+ )
472
+
473
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
474
+
475
+ if txt_ids.ndim == 3:
476
+ logger.warning(
477
+ "Passing `txt_ids` 3d torch.Tensor is deprecated."
478
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
479
+ )
480
+ txt_ids = txt_ids[0]
481
+ if img_ids.ndim == 3:
482
+ logger.warning(
483
+ "Passing `img_ids` 3d torch.Tensor is deprecated."
484
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
485
+ )
486
+ img_ids = img_ids[0]
487
+
488
+ ids = torch.cat((txt_ids, img_ids), dim=0)
489
+ image_rotary_emb = self.pos_embed(ids)
490
+
491
+ if joint_attention_kwargs is not None and "ip_adapter_image_embeds" in joint_attention_kwargs:
492
+ ip_adapter_image_embeds = joint_attention_kwargs.pop("ip_adapter_image_embeds")
493
+ ip_hidden_states = self.encoder_hid_proj(ip_adapter_image_embeds)
494
+ joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states})
495
+
496
+ for index_block, block in enumerate(self.transformer_blocks):
497
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
498
+
499
+ def create_custom_forward(module, return_dict=None):
500
+ def custom_forward(*inputs):
501
+ if return_dict is not None:
502
+ return module(*inputs, return_dict=return_dict)
503
+ else:
504
+ return module(*inputs)
505
+
506
+ return custom_forward
507
+
508
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
509
+ if use_condition:
510
+ encoder_hidden_states, hidden_states, cond_hidden_states = torch.utils.checkpoint.checkpoint(
511
+ create_custom_forward(block),
512
+ hidden_states,
513
+ cond_hidden_states,
514
+ encoder_hidden_states,
515
+ temb,
516
+ cond_temb,
517
+ image_rotary_emb,
518
+ joint_attention_kwargs,
519
+ **ckpt_kwargs,
520
+ )
521
+ else:
522
+ encoder_hidden_states, hidden_states, _ = torch.utils.checkpoint.checkpoint(
523
+ create_custom_forward(block),
524
+ hidden_states,
525
+ None,
526
+ encoder_hidden_states,
527
+ temb,
528
+ None,
529
+ image_rotary_emb,
530
+ joint_attention_kwargs,
531
+ **ckpt_kwargs,
532
+ )
533
+
534
+ else:
535
+ encoder_hidden_states, hidden_states, cond_hidden_states = block(
536
+ hidden_states=hidden_states,
537
+ encoder_hidden_states=encoder_hidden_states,
538
+ cond_hidden_states=cond_hidden_states if use_condition else None,
539
+ temb=temb,
540
+ cond_temb=cond_temb if use_condition else None,
541
+ image_rotary_emb=image_rotary_emb,
542
+ joint_attention_kwargs=joint_attention_kwargs,
543
+ )
544
+
545
+ # controlnet residual
546
+ if controlnet_block_samples is not None:
547
+ interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
548
+ interval_control = int(np.ceil(interval_control))
549
+ # For Xlabs ControlNet.
550
+ if controlnet_blocks_repeat:
551
+ hidden_states = (
552
+ hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
553
+ )
554
+ else:
555
+ hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
556
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
557
+
558
+ for index_block, block in enumerate(self.single_transformer_blocks):
559
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
560
+
561
+ def create_custom_forward(module, return_dict=None):
562
+ def custom_forward(*inputs):
563
+ if return_dict is not None:
564
+ return module(*inputs, return_dict=return_dict)
565
+ else:
566
+ return module(*inputs)
567
+
568
+ return custom_forward
569
+
570
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
571
+ if use_condition:
572
+ hidden_states, cond_hidden_states = torch.utils.checkpoint.checkpoint(
573
+ create_custom_forward(block),
574
+ hidden_states,
575
+ cond_hidden_states,
576
+ temb,
577
+ cond_temb,
578
+ image_rotary_emb,
579
+ joint_attention_kwargs,
580
+ **ckpt_kwargs,
581
+ )
582
+ else:
583
+ hidden_states, _ = torch.utils.checkpoint.checkpoint(
584
+ create_custom_forward(block),
585
+ hidden_states,
586
+ None,
587
+ temb,
588
+ None,
589
+ image_rotary_emb,
590
+ joint_attention_kwargs,
591
+ **ckpt_kwargs,
592
+ )
593
+
594
+ else:
595
+ hidden_states, cond_hidden_states = block(
596
+ hidden_states=hidden_states,
597
+ cond_hidden_states=cond_hidden_states if use_condition else None,
598
+ temb=temb,
599
+ cond_temb=cond_temb if use_condition else None,
600
+ image_rotary_emb=image_rotary_emb,
601
+ joint_attention_kwargs=joint_attention_kwargs,
602
+ )
603
+
604
+ # controlnet residual
605
+ if controlnet_single_block_samples is not None:
606
+ interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
607
+ interval_control = int(np.ceil(interval_control))
608
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
609
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...]
610
+ + controlnet_single_block_samples[index_block // interval_control]
611
+ )
612
+
613
+ hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
614
+
615
+ hidden_states = self.norm_out(hidden_states, temb)
616
+ output = self.proj_out(hidden_states)
617
+
618
+ if USE_PEFT_BACKEND:
619
+ # remove `lora_scale` from each PEFT layer
620
+ unscale_lora_layers(self, lora_scale)
621
+
622
+ if not return_dict:
623
+ return (output,)
624
+
625
+ return Transformer2DModelOutput(sample=output)
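A noteworthy detail in `forward` above is the fallback projection for 64-channel condition latents, which reuses the first 64 input columns of `x_embedder` rather than adding a separate linear layer. The standalone sketch below reproduces just that slicing; the dimensions are illustrative only (with the stock config's `in_channels=64` the `elif` branch is never reached):

import torch
import torch.nn as nn
import torch.nn.functional as F

inner_dim, in_channels = 3072, 384                  # illustrative sizes, not the model's actual config
x_embedder = nn.Linear(in_channels, inner_dim)

cond_hidden_states = torch.randn(1, 1024, 64)       # packed condition latents with 64 channels
weight = x_embedder.weight[:, :64]                  # [inner_dim, 64] slice of the shared projection
cond_embedded = F.linear(cond_hidden_states, weight, x_embedder.bias)
print(cond_embedded.shape)                          # torch.Size([1, 1024, 3072])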
train/train_kontext_color.py ADDED
@@ -0,0 +1,858 @@
1
+ import argparse
2
+ import copy
3
+ import logging
4
+ import math
5
+ import os
6
+ import shutil
7
+ from contextlib import nullcontext
8
+ from pathlib import Path
9
+ import re
10
+
11
+ from safetensors.torch import save_file
12
+ from PIL import Image
13
+ import numpy as np
14
+ import torch
15
+ import torch.utils.checkpoint
16
+ import transformers
17
+
18
+ from accelerate import Accelerator
19
+ from accelerate.logging import get_logger
20
+ from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
21
+
22
+ import diffusers
23
+ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler
24
+ from diffusers.optimization import get_scheduler
25
+ from diffusers.training_utils import (
26
+ cast_training_params,
27
+ compute_density_for_timestep_sampling,
28
+ compute_loss_weighting_for_sd3,
29
+ )
30
+ from diffusers.utils.torch_utils import is_compiled_module
31
+ from diffusers.utils import (
32
+ check_min_version,
33
+ is_wandb_available,
34
+ )
35
+
36
+ from src.prompt_helper import *
37
+ from src.lora_helper import *
38
+ from src.jsonl_datasets_kontext_color import make_train_dataset_inpaint_mask, collate_fn
39
+ from src.pipeline_flux_kontext_control import (
40
+ FluxKontextControlPipeline,
41
+ resize_position_encoding,
42
+ prepare_latent_subject_ids,
43
+ PREFERRED_KONTEXT_RESOLUTIONS
44
+ )
45
+ from src.transformer_flux import FluxTransformer2DModel
46
+ from diffusers.models.attention_processor import FluxAttnProcessor2_0
47
+ from src.layers import MultiDoubleStreamBlockLoraProcessor, MultiSingleStreamBlockLoraProcessor
48
+ from tqdm.auto import tqdm
49
+
50
+ if is_wandb_available():
51
+ import wandb
52
+
53
+
54
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
55
+ check_min_version("0.31.0.dev0")
56
+
57
+ logger = get_logger(__name__)
58
+
59
+
60
+ def log_validation(
61
+ pipeline,
62
+ args,
63
+ accelerator,
64
+ pipeline_args,
65
+ step,
66
+ torch_dtype,
67
+ is_final_validation=False,
68
+ ):
69
+ logger.info(
70
+ f"Running validation... Strict per-case evaluation for image, spatial image, and prompt."
71
+ )
72
+ pipeline = pipeline.to(accelerator.device)
73
+ pipeline.set_progress_bar_config(disable=True)
74
+
75
+ generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
76
+ autocast_ctx = nullcontext()
77
+
78
+ # Build per-case evaluation: require equal lengths for image, spatial image, and prompt
79
+ if args.validation_images is None or args.validation_images == ['None']:
80
+ raise ValueError("validation_images must be provided and non-empty")
81
+ if args.validation_prompt is None:
82
+ raise ValueError("validation_prompt must be provided and non-empty")
83
+
84
+ control_dict_root = dict(pipeline_args.get("control_dict", {})) if pipeline_args is not None else {}
85
+ spatial_ls = control_dict_root.get("spatial_images", []) or []
86
+
87
+ val_imgs = args.validation_images
88
+ prompts = args.validation_prompt
89
+
90
+ if not (len(val_imgs) == len(prompts) == len(spatial_ls)):
91
+ raise ValueError(
92
+ f"Length mismatch: validation_images={len(val_imgs)}, validation_prompt={len(prompts)}, spatial_images={len(spatial_ls)}"
93
+ )
94
+
95
+ results = []
96
+
97
+ def _resize_to_preferred(img: Image.Image) -> Image.Image:
98
+ w, h = img.size
99
+ aspect_ratio = w / h if h != 0 else 1.0
100
+ _, target_w, target_h = min(
101
+ (abs(aspect_ratio - (pref_w / pref_h)), pref_w, pref_h)
102
+ for (pref_h, pref_w) in PREFERRED_KONTEXT_RESOLUTIONS
103
+ )
104
+ return img.resize((target_w, target_h), Image.BICUBIC)
105
+
106
+ # Distributed per-rank assignment: each process handles its own slice of cases
107
+ num_cases = len(prompts)
108
+ logger.info(f"Paired validation (distributed): {num_cases} cases across {accelerator.num_processes} ranks")
109
+
110
+ # Indices assigned to this rank
111
+ rank = accelerator.process_index
112
+ world_size = accelerator.num_processes
113
+ local_indices = list(range(rank, num_cases, world_size))
114
+
115
+ local_images = []
116
+ with autocast_ctx:
117
+ for idx in local_indices:
118
+ try:
119
+ base_img = Image.open(val_imgs[idx]).convert("RGB")
120
+ resized_img = _resize_to_preferred(base_img)
121
+ except Exception as e:
122
+ raise ValueError(f"Failed to load/resize validation image idx={idx}: {e}")
123
+
124
+ case_args = dict(pipeline_args) if pipeline_args is not None else {}
125
+ case_args.pop("height", None)
126
+ case_args.pop("width", None)
127
+ if resized_img is not None:
128
+ tw, th = resized_img.size
129
+ case_args["height"] = th
130
+ case_args["width"] = tw
131
+
132
+ case_control = dict(case_args.get("control_dict", {}))
133
+ spatial_case = spatial_ls[idx]
134
+
135
+ # Load spatial image if it's a path; else assume it's already an image
136
+ if isinstance(spatial_case, str):
137
+ spatial_img = Image.open(spatial_case).convert("RGB")
138
+ else:
139
+ spatial_img = spatial_case
140
+
141
+ case_control["spatial_images"] = [spatial_img]
142
+ case_control["subject_images"] = []
143
+ case_args["control_dict"] = case_control
144
+
145
+ case_args["prompt"] = prompts[idx]
146
+ img = pipeline(image=resized_img, **case_args, generator=generator).images[0]
147
+ local_images.append(img)
148
+
149
+ # Gather all images per rank (pad to equal count) to main process
150
+ fixed_size = (1024, 1024)
151
+ max_local = int(math.ceil(num_cases / world_size)) if world_size > 0 else len(local_images)
152
+ # Build per-rank batch tensors
153
+ imgs_rank = []
154
+ idx_rank = []
155
+ has_rank = []
156
+ for j in range(max_local):
157
+ if j < len(local_images):
158
+ resized = local_images[j].resize(fixed_size, Image.BICUBIC)
159
+ img_np = np.asarray(resized).astype(np.uint8)
160
+ imgs_rank.append(torch.from_numpy(img_np))
161
+ idx_rank.append(local_indices[j])
162
+ has_rank.append(1)
163
+ else:
164
+ imgs_rank.append(torch.from_numpy(np.zeros((fixed_size[1], fixed_size[0], 3), dtype=np.uint8)))
165
+ idx_rank.append(-1)
166
+ has_rank.append(0)
167
+ imgs_rank_tensor = torch.stack([t.to(device=accelerator.device) for t in imgs_rank], dim=0) # [max_local, H, W, C]
168
+ idx_rank_tensor = torch.tensor(idx_rank, device=accelerator.device, dtype=torch.long) # [max_local]
169
+ has_rank_tensor = torch.tensor(has_rank, device=accelerator.device, dtype=torch.int) # [max_local]
170
+
171
+ gathered_has = accelerator.gather(has_rank_tensor) # [world * max_local]
172
+ gathered_idx = accelerator.gather(idx_rank_tensor) # [world * max_local]
173
+ gathered_imgs = accelerator.gather(imgs_rank_tensor) # [world * max_local, H, W, C]
174
+
175
+ if accelerator.is_main_process:
176
+ world = int(world_size)
177
+ slots = int(max_local)
178
+ try:
179
+ gathered_has = gathered_has.view(world, slots)
180
+ gathered_idx = gathered_idx.view(world, slots)
181
+ gathered_imgs = gathered_imgs.view(world, slots, fixed_size[1], fixed_size[0], 3)
182
+ except Exception:
183
+ # Fallback: treat as flat if reshape fails
184
+ gathered_has = gathered_has.view(-1, 1)
185
+ gathered_idx = gathered_idx.view(-1, 1)
186
+ gathered_imgs = gathered_imgs.view(-1, 1, fixed_size[1], fixed_size[0], 3)
187
+ world = int(gathered_has.shape[0])
188
+ slots = 1
189
+ for i in range(world):
190
+ for j in range(slots):
191
+ if int(gathered_has[i, j].item()) == 1:
192
+ idx = int(gathered_idx[i, j].item())
193
+ arr = gathered_imgs[i, j].cpu().numpy()
194
+ pil_img = Image.fromarray(arr.astype(np.uint8))
195
+ # Resize back to original validation image size
196
+ try:
197
+ orig = Image.open(val_imgs[idx]).convert("RGB")
198
+ pil_img = pil_img.resize(orig.size, Image.BICUBIC)
199
+ except Exception:
200
+ pass
201
+ results.append(pil_img)
202
+
203
+ # Log results (resize to 1024x1024 for saving or external trackers). Skip TensorBoard per request.
204
+ resized_for_log = [img.resize((1024, 1024), Image.BICUBIC) for img in results]
205
+ for tracker in accelerator.trackers:
206
+ phase_name = "test" if is_final_validation else "validation"
207
+ if tracker.name == "tensorboard":
208
+ continue
209
+ if tracker.name == "wandb":
210
+ tracker.log({
211
+ phase_name: [wandb.Image(image, caption=f"{i}: {prompts[i] if i < len(prompts) else ''}") for i, image in enumerate(resized_for_log)]
212
+ })
213
+
214
+ del pipeline
215
+ if torch.cuda.is_available():
216
+ torch.cuda.empty_cache()
217
+
218
+ return results
219
+
220
+
221
+ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"):
222
+ text_encoder_config = transformers.PretrainedConfig.from_pretrained(
223
+ pretrained_model_name_or_path, subfolder=subfolder, revision=revision
224
+ )
225
+ model_class = text_encoder_config.architectures[0]
226
+ if model_class == "CLIPTextModel":
227
+ from transformers import CLIPTextModel
228
+
229
+ return CLIPTextModel
230
+ elif model_class == "T5EncoderModel":
231
+ from transformers import T5EncoderModel
232
+
233
+ return T5EncoderModel
234
+ else:
235
+ raise ValueError(f"{model_class} is not supported.")
236
+
237
+
238
+ def parse_args(input_args=None):
239
+ parser = argparse.ArgumentParser(description="Training script for Flux Kontext with EasyControl.")
240
+ parser.add_argument("--lora_num", type=int, default=1, help="number of the lora.")
241
+ parser.add_argument("--cond_size", type=int, default=512, help="size of the condition data.")
242
+ parser.add_argument("--mode", type=str, default=None, help="Controller mode; kept for compatibility.")
243
+
244
+ parser.add_argument("--train_data_dir", type=str, default="", help="Path to JSONL dataset.")
245
+ parser.add_argument("--pretrained_model_name_or_path", type=str, default="", required=False, help="Base model path")
246
+ parser.add_argument("--pretrained_lora_path", type=str, default=None, required=False, help="LoRA checkpoint to initialize from")
247
+ parser.add_argument("--revision", type=str, default=None, required=False, help="Revision of pretrained model")
248
+ parser.add_argument("--variant", type=str, default=None, help="Variant of the model files")
249
+
250
+ parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
251
+ parser.add_argument("--max_sequence_length", type=int, default=128, help="Max sequence length for T5")
252
+ parser.add_argument("--kontext", type=str, default="disable")
253
+ parser.add_argument("--validation_prompt", type=str, nargs="+", default=None)
254
+ parser.add_argument("--validation_images", type=str, nargs="+", default=None, help="List of valiadation images")
255
+ parser.add_argument("--subject_test_images", type=str, nargs="+", default=None, help="List of subject test images")
256
+ parser.add_argument("--spatial_test_images", type=str, nargs="+", default=None, help="List of spatial test images")
257
+ parser.add_argument("--num_validation_images", type=int, default=4)
258
+ parser.add_argument("--validation_steps", type=int, default=20)
259
+
260
+ parser.add_argument("--ranks", type=int, nargs="+", default=[128], help="LoRA ranks")
261
+ parser.add_argument("--network_alphas", type=int, nargs="+", default=[128], help="LoRA network alphas")
262
+ parser.add_argument("--output_dir", type=str, default="/tiamat-NAS/zhangyuxuan/projects2/Easy_Control_0120/single_models/subject_model", help="Output directory")
263
+ parser.add_argument("--seed", type=int, default=None)
264
+ parser.add_argument("--train_batch_size", type=int, default=1)
265
+ parser.add_argument("--num_train_epochs", type=int, default=50)
266
+ parser.add_argument("--max_train_steps", type=int, default=None)
267
+ parser.add_argument("--checkpointing_steps", type=int, default=1000)
268
+ parser.add_argument("--checkpoints_total_limit", type=int, default=None)
269
+ parser.add_argument("--resume_from_checkpoint", type=str, default=None)
270
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
271
+ parser.add_argument("--gradient_checkpointing", action="store_true")
272
+ parser.add_argument("--learning_rate", type=float, default=1e-4)
273
+ parser.add_argument("--guidance_scale", type=float, default=1.0, help="Flux Kontext is guidance distilled")
274
+ parser.add_argument("--scale_lr", action="store_true", default=False)
275
+ parser.add_argument("--lr_scheduler", type=str, default="constant")
276
+ parser.add_argument("--lr_warmup_steps", type=int, default=500)
277
+ parser.add_argument("--lr_num_cycles", type=int, default=1)
278
+ parser.add_argument("--lr_power", type=float, default=1.0)
279
+ parser.add_argument("--dataloader_num_workers", type=int, default=1)
280
+ parser.add_argument("--weighting_scheme", type=str, default="none", choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"])
281
+ parser.add_argument("--logit_mean", type=float, default=0.0)
282
+ parser.add_argument("--logit_std", type=float, default=1.0)
283
+ parser.add_argument("--mode_scale", type=float, default=1.29)
284
+ parser.add_argument("--optimizer", type=str, default="AdamW")
285
+ parser.add_argument("--use_8bit_adam", action="store_true")
286
+ parser.add_argument("--adam_beta1", type=float, default=0.9)
287
+ parser.add_argument("--adam_beta2", type=float, default=0.999)
288
+ parser.add_argument("--prodigy_beta3", type=float, default=None)
289
+ parser.add_argument("--prodigy_decouple", type=bool, default=True)
290
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-04)
291
+ parser.add_argument("--adam_weight_decay_text_encoder", type=float, default=1e-03)
292
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08)
293
+ parser.add_argument("--prodigy_use_bias_correction", type=bool, default=True)
294
+ parser.add_argument("--prodigy_safeguard_warmup", type=bool, default=True)
295
+ parser.add_argument("--max_grad_norm", type=float, default=1.0)
296
+ parser.add_argument("--logging_dir", type=str, default="logs")
297
+ parser.add_argument("--cache_latents", action="store_true", default=False)
298
+ parser.add_argument("--report_to", type=str, default="tensorboard")
299
+ parser.add_argument("--mixed_precision", type=str, default="bf16", choices=["no", "fp16", "bf16"])
300
+ parser.add_argument("--upcast_before_saving", action="store_true", default=False)
301
+
302
+ if input_args is not None:
303
+ args = parser.parse_args(input_args)
304
+ else:
305
+ args = parser.parse_args()
306
+ return args
307
+
308
+
309
+ def main(args):
310
+ if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
311
+ raise ValueError("Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 or fp32 instead.")
312
+
313
+ if args.output_dir is not None:
314
+ os.makedirs(args.output_dir, exist_ok=True)
315
+ os.makedirs(args.logging_dir, exist_ok=True)
316
+ logging_dir = Path(args.output_dir, args.logging_dir)
317
+
318
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
319
+ kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
320
+ accelerator = Accelerator(
321
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
322
+ mixed_precision=args.mixed_precision,
323
+ log_with=args.report_to,
324
+ project_config=accelerator_project_config,
325
+ kwargs_handlers=[kwargs],
326
+ )
327
+
328
+ if torch.backends.mps.is_available():
329
+ accelerator.native_amp = False
330
+
331
+ if args.report_to == "wandb":
332
+ if not is_wandb_available():
333
+ raise ImportError("Install wandb for logging during training.")
334
+
335
+ logging.basicConfig(
336
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
337
+ datefmt="%m/%d/%Y %H:%M:%S",
338
+ level=logging.INFO,
339
+ )
340
+ logger.info(accelerator.state, main_process_only=False)
341
+ if accelerator.is_local_main_process:
342
+ transformers.utils.logging.set_verbosity_warning()
343
+ diffusers.utils.logging.set_verbosity_info()
344
+ else:
345
+ transformers.utils.logging.set_verbosity_error()
346
+ diffusers.utils.logging.set_verbosity_error()
347
+
348
+ if args.seed is not None:
349
+ set_seed(args.seed)
350
+
351
+ if accelerator.is_main_process and args.output_dir is not None:
352
+ os.makedirs(args.output_dir, exist_ok=True)
353
+
354
+ # Tokenizers
355
+ tokenizer_one = transformers.CLIPTokenizer.from_pretrained(
356
+ args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
357
+ )
358
+ tokenizer_two = transformers.T5TokenizerFast.from_pretrained(
359
+ args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision
360
+ )
361
+
362
+ # Text encoders
363
+ text_encoder_cls_one = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder")
364
+ text_encoder_cls_two = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2")
365
+
366
+ # Scheduler and models
367
+ noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
368
+ noise_scheduler_copy = copy.deepcopy(noise_scheduler)
369
+ text_encoder_one, text_encoder_two = load_text_encoders(args, text_encoder_cls_one, text_encoder_cls_two)
370
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant)
371
+ transformer = FluxTransformer2DModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant)
372
+
373
+ # Train only LoRA adapters
374
+ transformer.requires_grad_(True)
375
+ vae.requires_grad_(False)
376
+ text_encoder_one.requires_grad_(False)
377
+ text_encoder_two.requires_grad_(False)
378
+
379
+ weight_dtype = torch.float32
380
+ if accelerator.mixed_precision == "fp16":
381
+ weight_dtype = torch.float16
382
+ elif accelerator.mixed_precision == "bf16":
383
+ weight_dtype = torch.bfloat16
384
+
385
+ if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
386
+ raise ValueError("Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 or fp32 instead.")
387
+
388
+ vae.to(accelerator.device, dtype=weight_dtype)
389
+ transformer.to(accelerator.device, dtype=weight_dtype)
390
+ text_encoder_one.to(accelerator.device, dtype=weight_dtype)
391
+ text_encoder_two.to(accelerator.device, dtype=weight_dtype)
392
+
393
+ if args.gradient_checkpointing:
394
+ transformer.enable_gradient_checkpointing()
395
+
396
+ # Setup LoRA attention processors
397
+ if args.pretrained_lora_path is not None:
398
+ lora_path = args.pretrained_lora_path
399
+ checkpoint = load_checkpoint(lora_path)
400
+ lora_attn_procs = {}
401
+ double_blocks_idx = list(range(19))
402
+ single_blocks_idx = list(range(38))
403
+ number = 1
404
+ for name, attn_processor in transformer.attn_processors.items():
405
+ match = re.search(r'\.(\d+)\.', name)
406
+ if match:
407
+ layer_index = int(match.group(1))
408
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
409
+ lora_state_dicts = {}
410
+ for key, value in checkpoint.items():
411
+ if re.search(r'\.(\d+)\.', key):
412
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
413
+ if checkpoint_layer_index == layer_index and key.startswith("transformer_blocks"):
414
+ lora_state_dicts[key] = value
415
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
416
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
417
+ )
418
+ for n in range(number):
419
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
420
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
421
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
422
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
423
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
424
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
425
+ lora_attn_procs[name].proj_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.down.weight', None)
426
+ lora_attn_procs[name].proj_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.up.weight', None)
427
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
428
+ lora_state_dicts = {}
429
+ for key, value in checkpoint.items():
430
+ if re.search(r'\.(\d+)\.', key):
431
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
432
+ if checkpoint_layer_index == layer_index and key.startswith("single_transformer_blocks"):
433
+ lora_state_dicts[key] = value
434
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
435
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
436
+ )
437
+ for n in range(number):
438
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
439
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
440
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
441
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
442
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
443
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
444
+ else:
445
+ lora_attn_procs[name] = FluxAttnProcessor2_0()
446
+ else:
447
+ lora_attn_procs = {}
448
+ double_blocks_idx = list(range(19))
449
+ single_blocks_idx = list(range(38))
450
+ for name, attn_processor in transformer.attn_processors.items():
451
+ match = re.search(r'\.(\d+)\.', name)
452
+ if match:
453
+ layer_index = int(match.group(1))
454
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
455
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
456
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
457
+ )
458
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
459
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
460
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
461
+ )
462
+ else:
463
+ lora_attn_procs[name] = attn_processor
464
+
465
+ transformer.set_attn_processor(lora_attn_procs)
466
+ transformer.train()
467
+ for n, param in transformer.named_parameters():
468
+ if '_lora' not in n:
469
+ param.requires_grad = False
470
+ print(sum([p.numel() for p in transformer.parameters() if p.requires_grad]) / 1000000, 'M parameters')
471
+
472
+ def unwrap_model(model):
473
+ model = accelerator.unwrap_model(model)
474
+ model = model._orig_mod if is_compiled_module(model) else model
475
+ return model
476
+
477
+ if args.resume_from_checkpoint:
478
+ path = args.resume_from_checkpoint
479
+ global_step = int(path.split("-")[-1])
480
+ initial_global_step = global_step
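+ # Note: only the step counter is recovered from the checkpoint folder name here; optimizer/scheduler state is not reloaded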
481
+ else:
482
+ initial_global_step = 0
483
+ global_step = 0
484
+ first_epoch = 0
485
+
486
+ if args.scale_lr:
487
+ args.learning_rate = (
488
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
489
+ )
490
+
491
+ if args.mixed_precision == "fp16":
492
+ models = [transformer]
493
+ cast_training_params(models, dtype=torch.float32)
494
+
495
+ params_to_optimize = [p for p in transformer.parameters() if p.requires_grad]
496
+ transformer_parameters_with_lr = {"params": params_to_optimize, "lr": args.learning_rate}
497
+ print(sum([p.numel() for p in transformer.parameters() if p.requires_grad]) / 1000000, 'M parameters')
498
+
499
+ optimizer_class = torch.optim.AdamW
500
+ optimizer = optimizer_class(
501
+ [transformer_parameters_with_lr],
502
+ betas=(args.adam_beta1, args.adam_beta2),
503
+ weight_decay=args.adam_weight_decay,
504
+ eps=args.adam_epsilon,
505
+ )
506
+
507
+ tokenizers = [tokenizer_one, tokenizer_two]
508
+ text_encoders = [text_encoder_one, text_encoder_two]
509
+
510
+ train_dataset = make_train_dataset_inpaint_mask(args, tokenizers, accelerator)
511
+ train_dataloader = torch.utils.data.DataLoader(
512
+ train_dataset,
513
+ batch_size=args.train_batch_size,
514
+ shuffle=True,
515
+ collate_fn=collate_fn,
516
+ num_workers=args.dataloader_num_workers,
517
+ )
518
+
519
+ vae_config_shift_factor = vae.config.shift_factor
520
+ vae_config_scaling_factor = vae.config.scaling_factor
521
+
522
+ overrode_max_train_steps = False
523
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
524
+ if args.resume_from_checkpoint:
525
+ first_epoch = global_step // num_update_steps_per_epoch
526
+ if args.max_train_steps is None:
527
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
528
+ overrode_max_train_steps = True
529
+
530
+ lr_scheduler = get_scheduler(
531
+ args.lr_scheduler,
532
+ optimizer=optimizer,
533
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
534
+ num_training_steps=args.max_train_steps * accelerator.num_processes,
535
+ num_cycles=args.lr_num_cycles,
536
+ power=args.lr_power,
537
+ )
538
+
539
+ transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
540
+ transformer, optimizer, train_dataloader, lr_scheduler
541
+ )
542
+
543
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
544
+ if overrode_max_train_steps:
545
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
546
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
547
+
548
+ # Sanitize config for TensorBoard hparams (only allow int/float/bool/str/tensor). Others are stringified if possible; otherwise dropped
549
+ def _sanitize_hparams(config_dict):
550
+ sanitized = {}
551
+ for key, value in dict(config_dict).items():
552
+ try:
553
+ if value is None:
554
+ continue
555
+ # numpy scalar types
556
+ if isinstance(value, (np.integer,)):
557
+ sanitized[key] = int(value)
558
+ elif isinstance(value, (np.floating,)):
559
+ sanitized[key] = float(value)
560
+ elif isinstance(value, (int, float, bool, str)):
561
+ sanitized[key] = value
562
+ elif isinstance(value, Path):
563
+ sanitized[key] = str(value)
564
+ elif isinstance(value, (list, tuple)):
565
+ # stringify simple sequences; skip if fails
566
+ sanitized[key] = str(value)
567
+ else:
568
+ # best-effort stringify
569
+ sanitized[key] = str(value)
570
+ except Exception:
571
+ # skip unconvertible entries
572
+ continue
573
+ return sanitized
574
+
575
+ if accelerator.is_main_process:
576
+ tracker_name = "Easy_Control_Kontext"
577
+ accelerator.init_trackers(tracker_name, config=_sanitize_hparams(vars(args)))
578
+
579
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
580
+ logger.info("***** Running training *****")
581
+ logger.info(f" Num examples = {len(train_dataset)}")
582
+ logger.info(f" Num batches each epoch = {len(train_dataloader)}")
583
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
584
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
585
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
586
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
587
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
588
+
589
+ progress_bar = tqdm(
590
+ range(0, args.max_train_steps),
591
+ initial=initial_global_step,
592
+ desc="Steps",
593
+ disable=not accelerator.is_local_main_process,
594
+ )
595
+
596
+ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
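+ # Maps each sampled timestep to its sigma from the copied scheduler and broadcasts it to the latent tensor's rank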
597
+ sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
598
+ schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
599
+ timesteps = timesteps.to(accelerator.device)
600
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
601
+ sigma = sigmas[step_indices].flatten()
602
+ while len(sigma.shape) < n_dim:
603
+ sigma = sigma.unsqueeze(-1)
604
+ return sigma
605
+
606
+ # Kontext specifics
607
+ vae_scale_factor = 8 # Kontext uses 8x VAE factor; pack/unpack uses additional 2x in methods
608
+ # Match pipeline's prepare_latents cond resolution: 2 * (cond_size // (vae_scale_factor * 2))
609
+ height_cond = 2 * (args.cond_size // (vae_scale_factor * 2))
610
+ width_cond = 2 * (args.cond_size // (vae_scale_factor * 2))
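+ # e.g., cond_size=512 with vae_scale_factor=8 gives height_cond = width_cond = 2 * (512 // 16) = 64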
611
+ offset = 64
612
+
613
+ for epoch in range(first_epoch, args.num_train_epochs):
614
+ transformer.train()
615
+ for step, batch in enumerate(train_dataloader):
616
+ models_to_accumulate = [transformer]
617
+ with accelerator.accumulate(models_to_accumulate):
618
+ tokens = [batch["text_ids_1"], batch["text_ids_2"]]
619
+ prompt_embeds, pooled_prompt_embeds, text_ids = encode_token_ids(text_encoders, tokens, accelerator)
620
+ prompt_embeds = prompt_embeds.to(dtype=vae.dtype, device=accelerator.device)
621
+ pooled_prompt_embeds = pooled_prompt_embeds.to(dtype=vae.dtype, device=accelerator.device)
622
+ text_ids = text_ids.to(dtype=vae.dtype, device=accelerator.device)
623
+
624
+ pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
625
+ height_ = 2 * (int(pixel_values.shape[-2]) // (vae_scale_factor * 2))
626
+ width_ = 2 * (int(pixel_values.shape[-1]) // (vae_scale_factor * 2))
627
+
628
+ model_input = vae.encode(pixel_values).latent_dist.sample()
629
+ model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
630
+ model_input = model_input.to(dtype=weight_dtype)
631
+
632
+ latent_image_ids, cond_latent_image_ids = resize_position_encoding(
633
+ model_input.shape[0], height_, width_, height_cond, width_cond, accelerator.device, weight_dtype
634
+ )
635
+
636
+ noise = torch.randn_like(model_input)
637
+ bsz = model_input.shape[0]
638
+
639
+ u = compute_density_for_timestep_sampling(
640
+ weighting_scheme=args.weighting_scheme,
641
+ batch_size=bsz,
642
+ logit_mean=args.logit_mean,
643
+ logit_std=args.logit_std,
644
+ mode_scale=args.mode_scale,
645
+ )
646
+ indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
647
+ timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
648
+
649
+ sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
650
+ noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
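+ # Rectified-flow interpolation: sigma=1 yields pure noise, sigma=0 the clean latents; the loss target below is noise - model_input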
651
+
652
+ packed_noisy_model_input = FluxKontextControlPipeline._pack_latents(
653
+ noisy_model_input,
654
+ batch_size=model_input.shape[0],
655
+ num_channels_latents=model_input.shape[1],
656
+ height=model_input.shape[2],
657
+ width=model_input.shape[3],
658
+ )
659
+
660
+ latent_image_ids_to_concat = [latent_image_ids]
661
+ packed_cond_model_input_to_concat = []
662
+
663
+ if args.kontext == "enable":
664
+ source_pixel_values = batch["source_pixel_values"].to(dtype=vae.dtype)
665
+ source_image_latents = vae.encode(source_pixel_values).latent_dist.sample()
666
+ source_image_latents = (source_image_latents - vae_config_shift_factor) * vae_config_scaling_factor
667
+ image_latent_h, image_latent_w = source_image_latents.shape[2:]
668
+ packed_image_latents = FluxKontextControlPipeline._pack_latents(
669
+ source_image_latents,
670
+ batch_size=source_image_latents.shape[0],
671
+ num_channels_latents=source_image_latents.shape[1],
672
+ height=image_latent_h,
673
+ width=image_latent_w,
674
+ )
675
+ source_image_ids = FluxKontextControlPipeline._prepare_latent_image_ids(
676
+ batch_size=source_image_latents.shape[0],
677
+ height=image_latent_h // 2,
678
+ width=image_latent_w // 2,
679
+ device=accelerator.device,
680
+ dtype=weight_dtype,
681
+ )
682
+ source_image_ids[..., 0] = 1 # Mark as condition
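+ # The leading channel of the position ids distinguishes streams: the Kontext source is tagged 1 here, while spatial/subject conditions are tagged 2 below and target latents keep their default id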
683
+ latent_image_ids_to_concat.append(source_image_ids)
684
+
685
+
686
+ subject_pixel_values = batch.get("subject_pixel_values")
687
+ if subject_pixel_values is not None:
688
+ subject_pixel_values = subject_pixel_values.to(dtype=vae.dtype)
689
+ subject_input = vae.encode(subject_pixel_values).latent_dist.sample()
690
+ subject_input = (subject_input - vae_config_shift_factor) * vae_config_scaling_factor
691
+ subject_input = subject_input.to(dtype=weight_dtype)
692
+ sub_number = subject_pixel_values.shape[-2] // args.cond_size
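+ # Multiple subject conditions appear stacked along the height axis, so their count is inferred from height // cond_size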
693
+ latent_subject_ids = prepare_latent_subject_ids(height_cond // 2, width_cond // 2, accelerator.device, weight_dtype)
694
+ latent_subject_ids[..., 0] = 2
695
+ latent_subject_ids[:, 1] += offset
696
+ sub_latent_image_ids = torch.cat([latent_subject_ids for _ in range(sub_number)], dim=0)
697
+ latent_image_ids_to_concat.append(sub_latent_image_ids)
698
+
699
+ packed_subject_model_input = FluxKontextControlPipeline._pack_latents(
700
+ subject_input,
701
+ batch_size=subject_input.shape[0],
702
+ num_channels_latents=subject_input.shape[1],
703
+ height=subject_input.shape[2],
704
+ width=subject_input.shape[3],
705
+ )
706
+ packed_cond_model_input_to_concat.append(packed_subject_model_input)
707
+
708
+ cond_pixel_values = batch.get("cond_pixel_values")
709
+ if cond_pixel_values is not None:
710
+ cond_pixel_values = cond_pixel_values.to(dtype=vae.dtype)
711
+ cond_input = vae.encode(cond_pixel_values).latent_dist.sample()
712
+ cond_input = (cond_input - vae_config_shift_factor) * vae_config_scaling_factor
713
+ cond_input = cond_input.to(dtype=weight_dtype)
714
+ cond_number = cond_pixel_values.shape[-2] // args.cond_size
715
+ cond_latent_image_ids[..., 0] = 2
716
+ cond_latent_image_ids_rep = torch.cat([cond_latent_image_ids for _ in range(cond_number)], dim=0)
717
+ latent_image_ids_to_concat.append(cond_latent_image_ids_rep)
718
+
719
+ packed_cond_model_input = FluxKontextControlPipeline._pack_latents(
720
+ cond_input,
721
+ batch_size=cond_input.shape[0],
722
+ num_channels_latents=cond_input.shape[1],
723
+ height=cond_input.shape[2],
724
+ width=cond_input.shape[3],
725
+ )
726
+ packed_cond_model_input_to_concat.append(packed_cond_model_input)
727
+
728
+ latent_image_ids = torch.cat(latent_image_ids_to_concat, dim=0)
729
+ cond_packed_noisy_model_input = torch.cat(packed_cond_model_input_to_concat, dim=1)
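+ # Condition latents travel through the separate cond_hidden_states stream, while the Kontext source latents are appended to hidden_states below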
730
+
731
+ if accelerator.unwrap_model(transformer).config.guidance_embeds:
732
+ guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
733
+ guidance = guidance.expand(model_input.shape[0])
734
+ else:
735
+ guidance = None
736
+
737
+ latent_model_input = packed_noisy_model_input
738
+ if args.kontext == "enable":
739
+ latent_model_input = torch.cat([latent_model_input, packed_image_latents], dim=1)
740
+ model_pred = transformer(
741
+ hidden_states=latent_model_input,
742
+ cond_hidden_states=cond_packed_noisy_model_input,
743
+ timestep=timesteps / 1000,
744
+ guidance=guidance,
745
+ pooled_projections=pooled_prompt_embeds,
746
+ encoder_hidden_states=prompt_embeds,
747
+ txt_ids=text_ids,
748
+ img_ids=latent_image_ids,
749
+ return_dict=False,
750
+ )[0]
751
+
752
+ model_pred = model_pred[:, : packed_noisy_model_input.size(1)]
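+ # Keep only the prediction over the target latent tokens; tokens for the appended Kontext image are discarded before unpacking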
753
+
754
+ model_pred = FluxKontextControlPipeline._unpack_latents(
755
+ model_pred,
756
+ height=int(pixel_values.shape[-2]),
757
+ width=int(pixel_values.shape[-1]),
758
+ vae_scale_factor=vae_scale_factor,
759
+ )
760
+
761
+ weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
762
+ target = noise - model_input
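+ # Flow-matching velocity target: the model is trained to predict noise minus the clean latents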
763
+
764
+ loss = torch.mean((weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1), 1)
765
+ loss = loss.mean()
766
+ accelerator.backward(loss)
767
+ if accelerator.sync_gradients:
768
+ params_to_clip = (transformer.parameters())
769
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
770
+
771
+ optimizer.step()
772
+ lr_scheduler.step()
773
+ optimizer.zero_grad()
774
+
775
+ if accelerator.sync_gradients:
776
+ progress_bar.update(1)
777
+ global_step += 1
778
+
779
+ if accelerator.is_main_process:
780
+ if global_step % args.checkpointing_steps == 0:
781
+ if args.checkpoints_total_limit is not None:
782
+ checkpoints = os.listdir(args.output_dir)
783
+ checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
784
+ checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
785
+ if len(checkpoints) >= args.checkpoints_total_limit:
786
+ num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
787
+ removing_checkpoints = checkpoints[0:num_to_remove]
788
+ logger.info(f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints")
789
+ logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
790
+ for removing_checkpoint in removing_checkpoints:
791
+ removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
792
+ shutil.rmtree(removing_checkpoint)
793
+
794
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
795
+ os.makedirs(save_path, exist_ok=True)
796
+ unwrapped_model_state = accelerator.unwrap_model(transformer).state_dict()
797
+ lora_state_dict = {k: unwrapped_model_state[k] for k in unwrapped_model_state.keys() if '_lora' in k}
798
+ save_file(lora_state_dict, os.path.join(save_path, "lora.safetensors"))
799
+ logger.info(f"Saved state to {save_path}")
800
+
801
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
802
+ progress_bar.set_postfix(**logs)
803
+ accelerator.log(logs, step=global_step)
804
+
805
+ if args.validation_prompt is not None and global_step % args.validation_steps == 0:
806
+ # Create pipeline on every rank to run validation in parallel
807
+ pipeline = FluxKontextControlPipeline.from_pretrained(
808
+ args.pretrained_model_name_or_path,
809
+ vae=vae,
810
+ text_encoder=accelerator.unwrap_model(text_encoder_one),
811
+ text_encoder_2=accelerator.unwrap_model(text_encoder_two),
812
+ transformer=accelerator.unwrap_model(transformer),
813
+ revision=args.revision,
814
+ variant=args.variant,
815
+ torch_dtype=weight_dtype,
816
+ )
817
+
818
+ if args.spatial_test_images is not None and len(args.spatial_test_images) != 0 and args.spatial_test_images != ['None']:
819
+ spatial_paths = args.spatial_test_images
820
+ else:
821
+ spatial_paths = []
822
+
823
+ pipeline_args = {
824
+ "prompt": args.validation_prompt,
825
+ "cond_size": args.cond_size,
826
+ "guidance_scale": 3.5,
827
+ "num_inference_steps": 20,
828
+ "max_sequence_length": 128,
829
+ "control_dict": {"spatial_images": spatial_paths},
830
+ }
831
+
832
+ images = log_validation(
833
+ pipeline=pipeline,
834
+ args=args,
835
+ accelerator=accelerator,
836
+ pipeline_args=pipeline_args,
837
+ step=global_step,
838
+ torch_dtype=weight_dtype,
839
+ )
840
+
841
+ # Only main process saves/logs
842
+ if accelerator.is_main_process:
843
+ save_path = os.path.join(args.output_dir, "validation")
844
+ os.makedirs(save_path, exist_ok=True)
845
+ save_folder = os.path.join(save_path, f"checkpoint-{global_step}")
846
+ os.makedirs(save_folder, exist_ok=True)
847
+ for idx, img in enumerate(images):
848
+ img.save(os.path.join(save_folder, f"{idx}.jpg"))
849
+ del pipeline
850
+
851
+ accelerator.wait_for_everyone()
852
+ accelerator.end_training()
853
+
854
+
855
+ if __name__ == "__main__":
856
+ args = parse_args()
857
+ main(args)
858
+
train/train_kontext_color.sh ADDED
@@ -0,0 +1,25 @@
1
+ export MODEL_DIR="" # your flux path
2
+ export OUTPUT_DIR="" # your save path
3
+ export CONFIG="./default_config.yaml"
4
+ export TRAIN_DATA="" # your data jsonl file
5
+ export LOG_PATH="$OUTPUT_DIR/log"
6
+
7
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file $CONFIG train_kontext_color.py \
8
+ --pretrained_model_name_or_path $MODEL_DIR \
9
+ --lora_num=1 \
10
+ --cond_size=512 \
11
+ --ranks 128 \
12
+ --network_alphas 128 \
13
+ --output_dir=$OUTPUT_DIR \
14
+ --logging_dir=$LOG_PATH \
15
+ --mixed_precision="bf16" \
16
+ --train_data_dir=$TRAIN_DATA \
17
+ --learning_rate=1e-4 \
18
+ --train_batch_size=1 \
19
+ --num_train_epochs=1 \
20
+ --validation_steps=100 \
21
+ --checkpointing_steps=1000 \
22
+ --validation_images "./kontext_color_test/img_1.png" \
23
+ --spatial_test_images "./kontext_color_test/color_1.png" \
24
+ --validation_prompt "Let this woman have red purple and blue hair" \
25
+ --num_validation_images=1
train/train_kontext_complete_lora.sh ADDED
@@ -0,0 +1,20 @@
1
+ export MODEL_DIR="" # your flux path
2
+ export OUTPUT_DIR="" # your save path
3
+ export CONFIG="./default_config.yaml"
4
+ export LOG_PATH="$OUTPUT_DIR/log"
5
+
6
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file $CONFIG train_kontext_lora.py \
7
+ --train_data_jsonl "" \
8
+ --pretrained_model_name_or_path $MODEL_DIR \
9
+ --output_dir=$OUTPUT_DIR \
10
+ --logging_dir=$LOG_PATH \
11
+ --mixed_precision="bf16" \
12
+ --learning_rate=1e-4 \
13
+ --train_batch_size=1 \
14
+ --num_train_epochs=5 \
15
+ --validation_steps=100 \
16
+ --checkpointing_steps=500 \
17
+ --validation_images "./kontext_complete_test/img_1.png" \
18
+ --validation_prompt "" \
19
+ --gradient_checkpointing \
20
+ --num_validation_images=1
train/train_kontext_edge.py ADDED
@@ -0,0 +1,814 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import copy
3
+ import logging
4
+ import math
5
+ import os
6
+ import shutil
7
+ from contextlib import nullcontext
8
+ from pathlib import Path
9
+ import re
10
+
11
+ from safetensors.torch import save_file
12
+ from PIL import Image
13
+ import numpy as np
14
+ import torch
15
+ import torch.utils.checkpoint
16
+ import transformers
17
+
18
+ from accelerate import Accelerator
19
+ from accelerate.logging import get_logger
20
+ from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
21
+
22
+ import diffusers
23
+ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler
24
+ from diffusers.optimization import get_scheduler
25
+ from diffusers.training_utils import (
26
+ cast_training_params,
27
+ compute_density_for_timestep_sampling,
28
+ compute_loss_weighting_for_sd3,
29
+ )
30
+ from diffusers.utils.torch_utils import is_compiled_module
31
+ from diffusers.utils import (
32
+ check_min_version,
33
+ is_wandb_available,
34
+ )
35
+
36
+ from src.prompt_helper import *
37
+ from src.lora_helper import *
38
+ from src.jsonl_datasets_kontext_edge import make_train_dataset_inpaint_mask, collate_fn
39
+ from src.pipeline_flux_kontext_control import (
40
+ FluxKontextControlPipeline,
41
+ resize_position_encoding,
42
+ prepare_latent_subject_ids,
43
+ PREFERRED_KONTEXT_RESOLUTIONS
44
+ )
45
+ from src.transformer_flux import FluxTransformer2DModel
46
+ from diffusers.models.attention_processor import FluxAttnProcessor2_0
47
+ from src.layers import MultiDoubleStreamBlockLoraProcessor, MultiSingleStreamBlockLoraProcessor
48
+ from tqdm.auto import tqdm
49
+
50
+ if is_wandb_available():
51
+ import wandb
52
+
53
+
54
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
55
+ check_min_version("0.31.0.dev0")
56
+
57
+ logger = get_logger(__name__)
58
+
59
+
60
+ def log_validation(
61
+ pipeline,
62
+ args,
63
+ accelerator,
64
+ pipeline_args,
65
+ step,
66
+ torch_dtype,
67
+ is_final_validation=False,
68
+ ):
69
+ logger.info(
70
+ f"Running validation... Strict per-case evaluation for image, spatial image, and prompt."
71
+ )
72
+ pipeline = pipeline.to(accelerator.device)
73
+ pipeline.set_progress_bar_config(disable=True)
74
+
75
+ generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
76
+ autocast_ctx = nullcontext()
77
+
78
+ # Build per-case evaluation: require equal lengths for image, spatial image, and prompt
79
+ if args.validation_images is None or args.validation_images == ['None']:
80
+ raise ValueError("validation_images must be provided and non-empty")
81
+ if args.validation_prompt is None:
82
+ raise ValueError("validation_prompt must be provided and non-empty")
83
+
84
+ control_dict_root = dict(pipeline_args.get("control_dict", {})) if pipeline_args is not None else {}
85
+ spatial_ls = control_dict_root.get("spatial_images", []) or []
86
+
87
+ val_imgs = args.validation_images
88
+ prompts = args.validation_prompt
89
+
90
+ if not (len(val_imgs) == len(prompts) == len(spatial_ls)):
91
+ raise ValueError(
92
+ f"Length mismatch: validation_images={len(val_imgs)}, validation_prompt={len(prompts)}, spatial_images={len(spatial_ls)}"
93
+ )
94
+
95
+ results = []
96
+
97
+ def _resize_to_preferred(img: Image.Image) -> Image.Image:
98
+ w, h = img.size
99
+ aspect_ratio = w / h if h != 0 else 1.0
100
+ _, target_w, target_h = min(
101
+ (abs(aspect_ratio - (pref_w / pref_h)), pref_w, pref_h)
102
+ for (pref_h, pref_w) in PREFERRED_KONTEXT_RESOLUTIONS
103
+ )
104
+ return img.resize((target_w, target_h), Image.BICUBIC)
105
+
106
+ # Strict per-case loop
107
+ num_cases = len(prompts)
108
+ logger.info(f"Paired validation: {num_cases} (image, spatial, prompt) cases")
109
+ with autocast_ctx:
110
+ for idx in range(num_cases):
111
+ resized_img = None
112
+ # If validation image path is a non-empty string, load and resize; otherwise, skip passing image
113
+ if isinstance(val_imgs[idx], str) and val_imgs[idx] != "":
114
+ try:
115
+ base_img = Image.open(val_imgs[idx]).convert("RGB")
116
+ resized_img = _resize_to_preferred(base_img)
117
+ except Exception as e:
118
+ raise ValueError(f"Failed to load/resize validation image idx={idx}: {e}")
119
+
120
+ case_args = dict(pipeline_args) if pipeline_args is not None else {}
121
+ case_args.pop("height", None)
122
+ case_args.pop("width", None)
123
+ if resized_img is not None:
124
+ tw, th = resized_img.size
125
+ case_args["height"] = th
126
+ case_args["width"] = tw
127
+ else:
128
+ # When no image is provided, default to 1024x1024
129
+ case_args["height"] = 1024
130
+ case_args["width"] = 1024
131
+
132
+ # Bind single spatial control image per case; pass it directly (no masking)
133
+ case_control = dict(case_args.get("control_dict", {}))
134
+ spatial_case = spatial_ls[idx]
135
+
136
+ # Load spatial image if it's a path; else assume it's already an image
137
+ try:
138
+ spatial_img = Image.open(spatial_case).convert("RGB") if isinstance(spatial_case, str) else spatial_case
139
+ except Exception:
140
+ spatial_img = spatial_case
141
+
142
+ case_control["spatial_images"] = [spatial_img]
143
+ case_control["subject_images"] = []
144
+ case_args["control_dict"] = case_control
145
+
146
+ # Override prompt per case
147
+ case_args["prompt"] = prompts[idx]
148
+
149
+ if resized_img is not None:
150
+ img = pipeline(image=resized_img, **case_args, generator=generator).images[0]
151
+ else:
152
+ img = pipeline(**case_args, generator=generator).images[0]
153
+ results.append(img)
154
+
155
+ # Log results (resize to 1024x1024 for logging only)
156
+ resized_for_log = [img.resize((1024, 1024), Image.BICUBIC) for img in results]
157
+ for tracker in accelerator.trackers:
158
+ phase_name = "test" if is_final_validation else "validation"
159
+ if tracker.name == "tensorboard":
160
+ np_images = np.stack([np.asarray(img) for img in resized_for_log])
161
+ tracker.writer.add_images(phase_name, np_images, step, dataformats="NHWC")
162
+ if tracker.name == "wandb":
163
+ tracker.log({
164
+ phase_name: [wandb.Image(image, caption=f"{i}: {prompts[i] if i < len(prompts) else ''}") for i, image in enumerate(resized_for_log)]
165
+ })
166
+
167
+ del pipeline
168
+ if torch.cuda.is_available():
169
+ torch.cuda.empty_cache()
170
+
171
+ return results
172
+
173
+
174
+ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"):
175
+ text_encoder_config = transformers.PretrainedConfig.from_pretrained(
176
+ pretrained_model_name_or_path, subfolder=subfolder, revision=revision
177
+ )
178
+ model_class = text_encoder_config.architectures[0]
179
+ if model_class == "CLIPTextModel":
180
+ from transformers import CLIPTextModel
181
+
182
+ return CLIPTextModel
183
+ elif model_class == "T5EncoderModel":
184
+ from transformers import T5EncoderModel
185
+
186
+ return T5EncoderModel
187
+ else:
188
+ raise ValueError(f"{model_class} is not supported.")
189
+
190
+
191
+ def parse_args(input_args=None):
192
+ parser = argparse.ArgumentParser(description="Training script for Flux Kontext with EasyControl.")
193
+ parser.add_argument("--lora_num", type=int, default=1, help="number of the lora.")
194
+ parser.add_argument("--cond_size", type=int, default=512, help="size of the condition data.")
195
+ parser.add_argument("--mode", type=str, default=None, help="Controller mode; kept for compatibility.")
196
+
197
+ parser.add_argument("--train_data_dir", type=str, default="", help="Path to JSONL dataset.")
198
+ parser.add_argument("--pretrained_model_name_or_path", type=str, default="", required=False, help="Base model path")
199
+ parser.add_argument("--pretrained_lora_path", type=str, default=None, required=False, help="LoRA checkpoint to initialize from")
200
+ parser.add_argument("--revision", type=str, default=None, required=False, help="Revision of pretrained model")
201
+ parser.add_argument("--variant", type=str, default=None, help="Variant of the model files")
202
+
203
+ parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
204
+ parser.add_argument("--max_sequence_length", type=int, default=128, help="Max sequence length for T5")
205
+ parser.add_argument("--kontext", type=str, default="disable")
206
+ parser.add_argument("--validation_prompt", type=str, nargs="+", default=None)
207
+ parser.add_argument("--validation_images", type=str, nargs="+", default=None, help="List of valiadation images")
208
+ parser.add_argument("--subject_test_images", type=str, nargs="+", default=None, help="List of subject test images")
209
+ parser.add_argument("--spatial_test_images", type=str, nargs="+", default=None, help="List of spatial test images")
210
+ parser.add_argument("--num_validation_images", type=int, default=4)
211
+ parser.add_argument("--validation_steps", type=int, default=20)
212
+
213
+ parser.add_argument("--ranks", type=int, nargs="+", default=[128], help="LoRA ranks")
214
+ parser.add_argument("--network_alphas", type=int, nargs="+", default=[128], help="LoRA network alphas")
215
+ parser.add_argument("--output_dir", type=str, default="/tiamat-NAS/zhangyuxuan/projects2/Easy_Control_0120/single_models/subject_model", help="Output directory")
216
+ parser.add_argument("--seed", type=int, default=None)
217
+ parser.add_argument("--train_batch_size", type=int, default=1)
218
+ parser.add_argument("--num_train_epochs", type=int, default=50)
219
+ parser.add_argument("--max_train_steps", type=int, default=None)
220
+ parser.add_argument("--checkpointing_steps", type=int, default=1000)
221
+ parser.add_argument("--checkpoints_total_limit", type=int, default=None)
222
+ parser.add_argument("--resume_from_checkpoint", type=str, default=None)
223
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
224
+ parser.add_argument("--gradient_checkpointing", action="store_true")
225
+ parser.add_argument("--learning_rate", type=float, default=1e-4)
226
+ parser.add_argument("--guidance_scale", type=float, default=1.0, help="Flux Kontext is guidance distilled")
227
+ parser.add_argument("--scale_lr", action="store_true", default=False)
228
+ parser.add_argument("--lr_scheduler", type=str, default="constant")
229
+ parser.add_argument("--lr_warmup_steps", type=int, default=500)
230
+ parser.add_argument("--lr_num_cycles", type=int, default=1)
231
+ parser.add_argument("--lr_power", type=float, default=1.0)
232
+ parser.add_argument("--dataloader_num_workers", type=int, default=1)
233
+ parser.add_argument("--weighting_scheme", type=str, default="none", choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"])
234
+ parser.add_argument("--logit_mean", type=float, default=0.0)
235
+ parser.add_argument("--logit_std", type=float, default=1.0)
236
+ parser.add_argument("--mode_scale", type=float, default=1.29)
237
+ parser.add_argument("--optimizer", type=str, default="AdamW")
238
+ parser.add_argument("--use_8bit_adam", action="store_true")
239
+ parser.add_argument("--adam_beta1", type=float, default=0.9)
240
+ parser.add_argument("--adam_beta2", type=float, default=0.999)
241
+ parser.add_argument("--prodigy_beta3", type=float, default=None)
242
+ parser.add_argument("--prodigy_decouple", type=bool, default=True)
243
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-04)
244
+ parser.add_argument("--adam_weight_decay_text_encoder", type=float, default=1e-03)
245
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08)
246
+ parser.add_argument("--prodigy_use_bias_correction", type=bool, default=True)
247
+ parser.add_argument("--prodigy_safeguard_warmup", type=bool, default=True)
248
+ parser.add_argument("--max_grad_norm", type=float, default=1.0)
249
+ parser.add_argument("--logging_dir", type=str, default="logs")
250
+ parser.add_argument("--cache_latents", action="store_true", default=False)
251
+ parser.add_argument("--report_to", type=str, default="tensorboard")
252
+ parser.add_argument("--mixed_precision", type=str, default="bf16", choices=["no", "fp16", "bf16"])
253
+ parser.add_argument("--upcast_before_saving", action="store_true", default=False)
254
+
255
+ if input_args is not None:
256
+ args = parser.parse_args(input_args)
257
+ else:
258
+ args = parser.parse_args()
259
+ return args
260
+
261
+
262
+ def main(args):
263
+ if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
264
+ raise ValueError("Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 or fp32 instead.")
265
+
266
+ if args.output_dir is not None:
267
+ os.makedirs(args.output_dir, exist_ok=True)
268
+ os.makedirs(args.logging_dir, exist_ok=True)
269
+ logging_dir = Path(args.output_dir, args.logging_dir)
270
+
271
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
272
+ kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
273
+ accelerator = Accelerator(
274
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
275
+ mixed_precision=args.mixed_precision,
276
+ log_with=args.report_to,
277
+ project_config=accelerator_project_config,
278
+ kwargs_handlers=[kwargs],
279
+ )
280
+
281
+ if torch.backends.mps.is_available():
282
+ accelerator.native_amp = False
283
+
284
+ if args.report_to == "wandb":
285
+ if not is_wandb_available():
286
+ raise ImportError("Install wandb for logging during training.")
287
+
288
+ logging.basicConfig(
289
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
290
+ datefmt="%m/%d/%Y %H:%M:%S",
291
+ level=logging.INFO,
292
+ )
293
+ logger.info(accelerator.state, main_process_only=False)
294
+ if accelerator.is_local_main_process:
295
+ transformers.utils.logging.set_verbosity_warning()
296
+ diffusers.utils.logging.set_verbosity_info()
297
+ else:
298
+ transformers.utils.logging.set_verbosity_error()
299
+ diffusers.utils.logging.set_verbosity_error()
300
+
301
+ if args.seed is not None:
302
+ set_seed(args.seed)
303
+
304
+ if accelerator.is_main_process and args.output_dir is not None:
305
+ os.makedirs(args.output_dir, exist_ok=True)
306
+
307
+ # Tokenizers
308
+ tokenizer_one = transformers.CLIPTokenizer.from_pretrained(
309
+ args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
310
+ )
311
+ tokenizer_two = transformers.T5TokenizerFast.from_pretrained(
312
+ args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision
313
+ )
314
+
315
+ # Text encoders
316
+ text_encoder_cls_one = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder")
317
+ text_encoder_cls_two = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2")
318
+
319
+ # Scheduler and models
320
+ noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
321
+ noise_scheduler_copy = copy.deepcopy(noise_scheduler)
322
+ text_encoder_one, text_encoder_two = load_text_encoders(args, text_encoder_cls_one, text_encoder_cls_two)
323
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant)
324
+ transformer = FluxTransformer2DModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant)
325
+
326
+ # Train only LoRA adapters
327
+ transformer.requires_grad_(True)
328
+ vae.requires_grad_(False)
329
+ text_encoder_one.requires_grad_(False)
330
+ text_encoder_two.requires_grad_(False)
331
+
332
+ weight_dtype = torch.float32
333
+ if accelerator.mixed_precision == "fp16":
334
+ weight_dtype = torch.float16
335
+ elif accelerator.mixed_precision == "bf16":
336
+ weight_dtype = torch.bfloat16
337
+
338
+ if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
339
+ raise ValueError("Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 or fp32 instead.")
340
+
341
+ vae.to(accelerator.device, dtype=weight_dtype)
342
+ transformer.to(accelerator.device, dtype=weight_dtype)
343
+ text_encoder_one.to(accelerator.device, dtype=weight_dtype)
344
+ text_encoder_two.to(accelerator.device, dtype=weight_dtype)
345
+
346
+ if args.gradient_checkpointing:
347
+ transformer.enable_gradient_checkpointing()
348
+
349
+ # Setup LoRA attention processors
350
+ if args.pretrained_lora_path is not None:
351
+ lora_path = args.pretrained_lora_path
352
+ checkpoint = load_checkpoint(lora_path)
353
+ lora_attn_procs = {}
354
+ double_blocks_idx = list(range(19))
355
+ single_blocks_idx = list(range(38))
356
+ number = 1
357
+ for name, attn_processor in transformer.attn_processors.items():
358
+ match = re.search(r'\.(\d+)\.', name)
359
+ if match:
360
+ layer_index = int(match.group(1))
361
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
362
+ lora_state_dicts = {}
363
+ for key, value in checkpoint.items():
364
+ if re.search(r'\.(\d+)\.', key):
365
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
366
+ if checkpoint_layer_index == layer_index and key.startswith("transformer_blocks"):
367
+ lora_state_dicts[key] = value
368
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
369
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
370
+ )
371
+ for n in range(number):
372
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
373
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
374
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
375
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
376
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
377
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
378
+ lora_attn_procs[name].proj_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.down.weight', None)
379
+ lora_attn_procs[name].proj_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.up.weight', None)
380
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
381
+ lora_state_dicts = {}
382
+ for key, value in checkpoint.items():
383
+ if re.search(r'\.(\d+)\.', key):
384
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
385
+ if checkpoint_layer_index == layer_index and key.startswith("single_transformer_blocks"):
386
+ lora_state_dicts[key] = value
387
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
388
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
389
+ )
390
+ for n in range(number):
391
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
392
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
393
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
394
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
395
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
396
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
397
+ else:
398
+ lora_attn_procs[name] = FluxAttnProcessor2_0()
399
+ else:
400
+ lora_attn_procs = {}
401
+ double_blocks_idx = list(range(19))
402
+ single_blocks_idx = list(range(38))
403
+ for name, attn_processor in transformer.attn_processors.items():
404
+ match = re.search(r'\.(\d+)\.', name)
405
+ if match:
406
+ layer_index = int(match.group(1))
407
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
408
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
409
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
410
+ )
411
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
412
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
413
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
414
+ )
415
+ else:
416
+ lora_attn_procs[name] = attn_processor
417
+
418
+ transformer.set_attn_processor(lora_attn_procs)
419
+ transformer.train()
420
+ for n, param in transformer.named_parameters():
421
+ if '_lora' not in n:
422
+ param.requires_grad = False
423
+ print(sum([p.numel() for p in transformer.parameters() if p.requires_grad]) / 1000000, 'M parameters')
424
+
425
+ def unwrap_model(model):
426
+ model = accelerator.unwrap_model(model)
427
+ model = model._orig_mod if is_compiled_module(model) else model
428
+ return model
429
+
430
+ if args.resume_from_checkpoint:
431
+ path = args.resume_from_checkpoint
432
+ global_step = int(path.split("-")[-1])
433
+ initial_global_step = global_step
434
+ else:
435
+ initial_global_step = 0
436
+ global_step = 0
437
+ first_epoch = 0
438
+
439
+ if args.scale_lr:
440
+ args.learning_rate = (
441
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
442
+ )
443
+
444
+ if args.mixed_precision == "fp16":
445
+ models = [transformer]
446
+ cast_training_params(models, dtype=torch.float32)
447
+
448
+ params_to_optimize = [p for p in transformer.parameters() if p.requires_grad]
449
+ transformer_parameters_with_lr = {"params": params_to_optimize, "lr": args.learning_rate}
450
+ print(sum([p.numel() for p in transformer.parameters() if p.requires_grad]) / 1000000, 'M parameters')
451
+
452
+ optimizer_class = torch.optim.AdamW
453
+ optimizer = optimizer_class(
454
+ [transformer_parameters_with_lr],
455
+ betas=(args.adam_beta1, args.adam_beta2),
456
+ weight_decay=args.adam_weight_decay,
457
+ eps=args.adam_epsilon,
458
+ )
459
+
460
+ tokenizers = [tokenizer_one, tokenizer_two]
461
+ text_encoders = [text_encoder_one, text_encoder_two]
462
+
463
+ train_dataset = make_train_dataset_inpaint_mask(args, tokenizers, accelerator)
464
+ train_dataloader = torch.utils.data.DataLoader(
465
+ train_dataset,
466
+ batch_size=args.train_batch_size,
467
+ shuffle=True,
468
+ collate_fn=collate_fn,
469
+ num_workers=args.dataloader_num_workers,
470
+ )
471
+
472
+ vae_config_shift_factor = vae.config.shift_factor
473
+ vae_config_scaling_factor = vae.config.scaling_factor
474
+
475
+ overrode_max_train_steps = False
476
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
477
+ if args.resume_from_checkpoint:
478
+ first_epoch = global_step // num_update_steps_per_epoch
479
+ if args.max_train_steps is None:
480
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
481
+ overrode_max_train_steps = True
482
+
483
+ lr_scheduler = get_scheduler(
484
+ args.lr_scheduler,
485
+ optimizer=optimizer,
486
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
487
+ num_training_steps=args.max_train_steps * accelerator.num_processes,
488
+ num_cycles=args.lr_num_cycles,
489
+ power=args.lr_power,
490
+ )
491
+
492
+ transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
493
+ transformer, optimizer, train_dataloader, lr_scheduler
494
+ )
495
+
496
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
497
+ if overrode_max_train_steps:
498
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
499
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
500
+
501
+ # Sanitize config for TensorBoard hparams (only allow int/float/bool/str/tensor). Others are stringified if possible; otherwise dropped
502
+ def _sanitize_hparams(config_dict):
503
+ sanitized = {}
504
+ for key, value in dict(config_dict).items():
505
+ try:
506
+ if value is None:
507
+ continue
508
+ # numpy scalar types
509
+ if isinstance(value, (np.integer,)):
510
+ sanitized[key] = int(value)
511
+ elif isinstance(value, (np.floating,)):
512
+ sanitized[key] = float(value)
513
+ elif isinstance(value, (int, float, bool, str)):
514
+ sanitized[key] = value
515
+ elif isinstance(value, Path):
516
+ sanitized[key] = str(value)
517
+ elif isinstance(value, (list, tuple)):
518
+ # stringify simple sequences; skip if fails
519
+ sanitized[key] = str(value)
520
+ else:
521
+ # best-effort stringify
522
+ sanitized[key] = str(value)
523
+ except Exception:
524
+ # skip unconvertible entries
525
+ continue
526
+ return sanitized
527
+
528
+ if accelerator.is_main_process:
529
+ tracker_name = "Easy_Control_Kontext"
530
+ accelerator.init_trackers(tracker_name, config=_sanitize_hparams(vars(args)))
531
+
532
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
533
+ logger.info("***** Running training *****")
534
+ logger.info(f" Num examples = {len(train_dataset)}")
535
+ logger.info(f" Num batches each epoch = {len(train_dataloader)}")
536
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
537
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
538
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
539
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
540
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
541
+
542
+ progress_bar = tqdm(
543
+ range(0, args.max_train_steps),
544
+ initial=initial_global_step,
545
+ desc="Steps",
546
+ disable=not accelerator.is_local_main_process,
547
+ )
548
+
549
+ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
550
+ sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
551
+ schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
552
+ timesteps = timesteps.to(accelerator.device)
553
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
554
+ sigma = sigmas[step_indices].flatten()
555
+ while len(sigma.shape) < n_dim:
556
+ sigma = sigma.unsqueeze(-1)
557
+ return sigma
558
+
559
+ # Kontext specifics
560
+ vae_scale_factor = 8 # Kontext uses 8x VAE factor; pack/unpack uses additional 2x in methods
561
+ # Match pipeline's prepare_latents cond resolution: 2 * (cond_size // (vae_scale_factor * 2))
562
+ height_cond = 2 * (args.cond_size // (vae_scale_factor * 2))
563
+ width_cond = 2 * (args.cond_size // (vae_scale_factor * 2))
564
+ offset = 64
565
+
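# Editor's note (illustrative, not part of the commit): worked arithmetic for the condition
# resolution computed above. With the script defaults cond_size = 512 and vae_scale_factor = 8,
#   height_cond = width_cond = 2 * (512 // (8 * 2)) = 64 latent positions per side,
# and after _pack_latents' additional 2x patchification the condition branch contributes
#   (64 // 2) * (64 // 2) = 1024 tokens to the transformer sequence.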
566
+ for epoch in range(first_epoch, args.num_train_epochs):
567
+ transformer.train()
568
+ for step, batch in enumerate(train_dataloader):
569
+ models_to_accumulate = [transformer]
570
+ with accelerator.accumulate(models_to_accumulate):
571
+ tokens = [batch["text_ids_1"], batch["text_ids_2"]]
572
+ prompt_embeds, pooled_prompt_embeds, text_ids = encode_token_ids(text_encoders, tokens, accelerator)
573
+ prompt_embeds = prompt_embeds.to(dtype=vae.dtype, device=accelerator.device)
574
+ pooled_prompt_embeds = pooled_prompt_embeds.to(dtype=vae.dtype, device=accelerator.device)
575
+ text_ids = text_ids.to(dtype=vae.dtype, device=accelerator.device)
576
+
577
+ pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
578
+ height_ = 2 * (int(pixel_values.shape[-2]) // (vae_scale_factor * 2))
579
+ width_ = 2 * (int(pixel_values.shape[-1]) // (vae_scale_factor * 2))
580
+
581
+ model_input = vae.encode(pixel_values).latent_dist.sample()
582
+ model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
583
+ model_input = model_input.to(dtype=weight_dtype)
584
+
585
+ latent_image_ids, cond_latent_image_ids = resize_position_encoding(
586
+ model_input.shape[0], height_, width_, height_cond, width_cond, accelerator.device, weight_dtype
587
+ )
588
+
589
+ noise = torch.randn_like(model_input)
590
+ bsz = model_input.shape[0]
591
+
592
+ u = compute_density_for_timestep_sampling(
593
+ weighting_scheme=args.weighting_scheme,
594
+ batch_size=bsz,
595
+ logit_mean=args.logit_mean,
596
+ logit_std=args.logit_std,
597
+ mode_scale=args.mode_scale,
598
+ )
599
+ indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
600
+ timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
601
+
602
+ sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
603
+ noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
604
+
605
+ packed_noisy_model_input = FluxKontextControlPipeline._pack_latents(
606
+ noisy_model_input,
607
+ batch_size=model_input.shape[0],
608
+ num_channels_latents=model_input.shape[1],
609
+ height=model_input.shape[2],
610
+ width=model_input.shape[3],
611
+ )
612
+
613
+ latent_image_ids_to_concat = [latent_image_ids]
614
+ packed_cond_model_input_to_concat = []
615
+
616
+ if args.kontext == "enable":
617
+ source_pixel_values = batch["source_pixel_values"].to(dtype=vae.dtype)
618
+ source_image_latents = vae.encode(source_pixel_values).latent_dist.sample()
619
+ source_image_latents = (source_image_latents - vae_config_shift_factor) * vae_config_scaling_factor
620
+ image_latent_h, image_latent_w = source_image_latents.shape[2:]
621
+ packed_image_latents = FluxKontextControlPipeline._pack_latents(
622
+ source_image_latents,
623
+ batch_size=source_image_latents.shape[0],
624
+ num_channels_latents=source_image_latents.shape[1],
625
+ height=image_latent_h,
626
+ width=image_latent_w,
627
+ )
628
+ source_image_ids = FluxKontextControlPipeline._prepare_latent_image_ids(
629
+ batch_size=source_image_latents.shape[0],
630
+ height=image_latent_h // 2,
631
+ width=image_latent_w // 2,
632
+ device=accelerator.device,
633
+ dtype=weight_dtype,
634
+ )
635
+ source_image_ids[..., 0] = 1 # Mark as condition
636
+ latent_image_ids_to_concat.append(source_image_ids)
637
+
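# Editor's note (descriptive, not part of the commit): the first channel of these RoPE image ids
# acts as a stream tag: 0 for the denoised target latents, 1 for the Kontext source image (set
# just above), and 2 for the spatial/subject condition tokens (set further below), which lets the
# transformer tell the concatenated token groups apart positionally.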
638
+
639
+ subject_pixel_values = batch.get("subject_pixel_values")
640
+ if subject_pixel_values is not None:
641
+ subject_pixel_values = subject_pixel_values.to(dtype=vae.dtype)
642
+ subject_input = vae.encode(subject_pixel_values).latent_dist.sample()
643
+ subject_input = (subject_input - vae_config_shift_factor) * vae_config_scaling_factor
644
+ subject_input = subject_input.to(dtype=weight_dtype)
645
+ sub_number = subject_pixel_values.shape[-2] // args.cond_size
646
+ latent_subject_ids = prepare_latent_subject_ids(height_cond // 2, width_cond // 2, accelerator.device, weight_dtype)
647
+ latent_subject_ids[..., 0] = 2
648
+ latent_subject_ids[:, 1] += offset
649
+ sub_latent_image_ids = torch.cat([latent_subject_ids for _ in range(sub_number)], dim=0)
650
+ latent_image_ids_to_concat.append(sub_latent_image_ids)
651
+
652
+ packed_subject_model_input = FluxKontextControlPipeline._pack_latents(
653
+ subject_input,
654
+ batch_size=subject_input.shape[0],
655
+ num_channels_latents=subject_input.shape[1],
656
+ height=subject_input.shape[2],
657
+ width=subject_input.shape[3],
658
+ )
659
+ packed_cond_model_input_to_concat.append(packed_subject_model_input)
660
+
661
+ cond_pixel_values = batch.get("cond_pixel_values")
662
+ if cond_pixel_values is not None:
663
+ cond_pixel_values = cond_pixel_values.to(dtype=vae.dtype)
664
+ cond_input = vae.encode(cond_pixel_values).latent_dist.sample()
665
+ cond_input = (cond_input - vae_config_shift_factor) * vae_config_scaling_factor
666
+ cond_input = cond_input.to(dtype=weight_dtype)
667
+ cond_number = cond_pixel_values.shape[-2] // args.cond_size
668
+ cond_latent_image_ids[..., 0] = 2
669
+ cond_latent_image_ids_rep = torch.cat([cond_latent_image_ids for _ in range(cond_number)], dim=0)
670
+ latent_image_ids_to_concat.append(cond_latent_image_ids_rep)
671
+
672
+ packed_cond_model_input = FluxKontextControlPipeline._pack_latents(
673
+ cond_input,
674
+ batch_size=cond_input.shape[0],
675
+ num_channels_latents=cond_input.shape[1],
676
+ height=cond_input.shape[2],
677
+ width=cond_input.shape[3],
678
+ )
679
+ packed_cond_model_input_to_concat.append(packed_cond_model_input)
680
+
681
+ latent_image_ids = torch.cat(latent_image_ids_to_concat, dim=0)
682
+ cond_packed_noisy_model_input = torch.cat(packed_cond_model_input_to_concat, dim=1)
683
+
684
+ if accelerator.unwrap_model(transformer).config.guidance_embeds:
685
+ guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
686
+ guidance = guidance.expand(model_input.shape[0])
687
+ else:
688
+ guidance = None
689
+
690
+ latent_model_input = packed_noisy_model_input
691
+ if args.kontext == "enable":
692
+ latent_model_input = torch.cat([latent_model_input, packed_image_latents], dim=1)
693
+ model_pred = transformer(
694
+ hidden_states=latent_model_input,
695
+ cond_hidden_states=cond_packed_noisy_model_input,
696
+ timestep=timesteps / 1000,
697
+ guidance=guidance,
698
+ pooled_projections=pooled_prompt_embeds,
699
+ encoder_hidden_states=prompt_embeds,
700
+ txt_ids=text_ids,
701
+ img_ids=latent_image_ids,
702
+ return_dict=False,
703
+ )[0]
704
+
705
+ model_pred = model_pred[:, : packed_noisy_model_input.size(1)]
706
+
707
+ model_pred = FluxKontextControlPipeline._unpack_latents(
708
+ model_pred,
709
+ height=int(pixel_values.shape[-2]),
710
+ width=int(pixel_values.shape[-1]),
711
+ vae_scale_factor=vae_scale_factor,
712
+ )
713
+
714
+ weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
715
+ target = noise - model_input
716
+
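# Editor's note (illustrative, not part of the commit): the interpolation above follows the
# rectified-flow schedule x_sigma = (1 - sigma) * x_0 + sigma * noise, whose velocity
# d x_sigma / d sigma = noise - x_0 is constant; that velocity is exactly the
# `target = noise - model_input` regressed by the MSE below.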
717
+ loss = torch.mean((weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1), 1)
718
+ loss = loss.mean()
719
+ accelerator.backward(loss)
720
+ if accelerator.sync_gradients:
721
+ params_to_clip = (transformer.parameters())
722
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
723
+
724
+ optimizer.step()
725
+ lr_scheduler.step()
726
+ optimizer.zero_grad()
727
+
728
+ if accelerator.sync_gradients:
729
+ progress_bar.update(1)
730
+ global_step += 1
731
+
732
+ if accelerator.is_main_process:
733
+ if global_step % args.checkpointing_steps == 0:
734
+ if args.checkpoints_total_limit is not None:
735
+ checkpoints = os.listdir(args.output_dir)
736
+ checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
737
+ checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
738
+ if len(checkpoints) >= args.checkpoints_total_limit:
739
+ num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
740
+ removing_checkpoints = checkpoints[0:num_to_remove]
741
+ logger.info(f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints")
742
+ logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
743
+ for removing_checkpoint in removing_checkpoints:
744
+ removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
745
+ shutil.rmtree(removing_checkpoint)
746
+
747
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
748
+ os.makedirs(save_path, exist_ok=True)
749
+ unwrapped_model_state = accelerator.unwrap_model(transformer).state_dict()
750
+ lora_state_dict = {k: unwrapped_model_state[k] for k in unwrapped_model_state.keys() if '_lora' in k}
751
+ save_file(lora_state_dict, os.path.join(save_path, "lora.safetensors"))
752
+ logger.info(f"Saved state to {save_path}")
753
+
754
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
755
+ progress_bar.set_postfix(**logs)
756
+ accelerator.log(logs, step=global_step)
757
+
758
+ if accelerator.is_main_process:
759
+ if args.validation_prompt is not None and global_step % args.validation_steps == 0:
760
+ pipeline = FluxKontextControlPipeline.from_pretrained(
761
+ args.pretrained_model_name_or_path,
762
+ vae=vae,
763
+ text_encoder=accelerator.unwrap_model(text_encoder_one),
764
+ text_encoder_2=accelerator.unwrap_model(text_encoder_two),
765
+ transformer=accelerator.unwrap_model(transformer),
766
+ revision=args.revision,
767
+ variant=args.variant,
768
+ torch_dtype=weight_dtype,
769
+ )
770
+
771
+ if args.subject_test_images is not None and len(args.subject_test_images) != 0 and args.subject_test_images != ['None']:
772
+ subject_paths = args.subject_test_images
773
+ subject_ls = [Image.open(image_path).convert("RGB") for image_path in subject_paths]
774
+ else:
775
+ subject_ls = []
776
+ if args.spatial_test_images is not None and len(args.spatial_test_images) != 0 and args.spatial_test_images != ['None']:
777
+ spatial_paths = args.spatial_test_images
778
+ spatial_ls = [Image.open(image_path).convert("RGB") for image_path in spatial_paths]
779
+ else:
780
+ spatial_ls = []
781
+
782
+ pipeline_args = {
783
+ "prompt": args.validation_prompt,
784
+ "cond_size": args.cond_size,
785
+ "guidance_scale": 3.5,
786
+ "num_inference_steps": 20,
787
+ "max_sequence_length": 128,
788
+ "control_dict": {"spatial_images": spatial_ls, "subject_images": subject_ls},
789
+ }
790
+
791
+ images = log_validation(
792
+ pipeline=pipeline,
793
+ args=args,
794
+ accelerator=accelerator,
795
+ pipeline_args=pipeline_args,
796
+ step=global_step,
797
+ torch_dtype=weight_dtype,
798
+ )
799
+ save_path = os.path.join(args.output_dir, "validation")
800
+ os.makedirs(save_path, exist_ok=True)
801
+ save_folder = os.path.join(save_path, f"checkpoint-{global_step}")
802
+ os.makedirs(save_folder, exist_ok=True)
803
+ for idx, img in enumerate(images):
804
+ img.save(os.path.join(save_folder, f"{idx}.jpg"))
805
+ del pipeline
806
+
807
+ accelerator.wait_for_everyone()
808
+ accelerator.end_training()
809
+
810
+
811
+ if __name__ == "__main__":
812
+ args = parse_args()
813
+ main(args)
814
+
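Editor's note: the checkpointing branch above stores only the LoRA tensors (state-dict keys containing `_lora`) as `checkpoint-<step>/lora.safetensors`. A minimal sketch of reading such a file back, assuming a hypothetical checkpoint path; `load_file` is the standard safetensors loader:

from safetensors.torch import load_file

# Hypothetical path following the checkpoint-<step> convention used by the script above.
lora_state = load_file("output/checkpoint-1000/lora.safetensors")
assert all("_lora" in key for key in lora_state)  # only LoRA weights were saved
print(sum(t.numel() for t in lora_state.values()) / 1e6, "M LoRA parameters")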
train/train_kontext_edge.sh ADDED
@@ -0,0 +1,25 @@
1
+ export MODEL_DIR="/robby/share/Editing/lzc/FLUX.1-Kontext-dev" # your flux path
2
+ export OUTPUT_DIR="/robby/share/Editing/lzc/EasyControl_kontext_edge_test_hed" # your save path
3
+ export CONFIG="./default_config.yaml"
4
+ export TRAIN_DATA="/robby/share/MM/zkc/data/i2i_csv/pexel_Qwen2_5VL7BInstruct.csv" # your training data file (CSV)
5
+ export LOG_PATH="$OUTPUT_DIR/log"
6
+
7
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file $CONFIG train_kontext_edge.py \
8
+ --pretrained_model_name_or_path $MODEL_DIR \
9
+ --lora_num=1 \
10
+ --cond_size=512 \
11
+ --ranks 128 \
12
+ --network_alphas 128 \
13
+ --output_dir=$OUTPUT_DIR \
14
+ --logging_dir=$LOG_PATH \
15
+ --mixed_precision="bf16" \
16
+ --train_data_dir=$TRAIN_DATA \
17
+ --learning_rate=1e-4 \
18
+ --train_batch_size=1 \
19
+ --num_train_epochs=1 \
20
+ --validation_steps=500 \
21
+ --checkpointing_steps=1000 \
22
+ --validation_images "./kontext_edge_test/img_1.png" "./kontext_edge_test/img_2.png" "" "" "./kontext_edge_test/img_3.png" \
23
+ --spatial_test_images "./kontext_edge_test/edge_1.png" "./kontext_edge_test/edge_2.png" "./kontext_edge_test/edge_1.png" "./kontext_edge_test/edge_2.png" "./kontext_edge_test/edge_3.png" \
24
+ --validation_prompt "The cake was cut off a piece" "Let this black woman wearing a transparent sunglasses" "This image shows a beautifully decorated cake with golden-orange sides and white frosting on top, and a piece of cake is being cut. The cake is displayed on a rustic wooden slice that serves as a cake stand." "This is a striking portrait photograph featuring a person wearing an ornate golden crown and a heart-shape sunglasses. The subject has dramatic golden metallic eyeshadow that extends across their eyelids, complementing the warm tones of the crown." "move the cup to the left" \
25
+ --num_validation_images=1
train/train_kontext_interactive_lora.sh ADDED
@@ -0,0 +1,18 @@
1
+ export MODEL_DIR="" # your flux path
2
+ export OUTPUT_DIR="" # your save path
3
+ export CONFIG="./default_config.yaml"
4
+ export LOG_PATH="$OUTPUT_DIR/log"
5
+
6
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file $CONFIG train_kontext_lora.py \
7
+ --pretrained_model_name_or_path $MODEL_DIR \
8
+ --output_dir=$OUTPUT_DIR \
9
+ --logging_dir=$LOG_PATH \
10
+ --mixed_precision="bf16" \
11
+ --learning_rate=1e-4 \
12
+ --train_batch_size=1 \
13
+ --num_train_epochs=10 \
14
+ --validation_steps=100 \
15
+ --checkpointing_steps=500 \
16
+ --validation_images "./kontext_interactive_test/img_1.png" \
17
+ --validation_prompt "Let the man hold the AK47 using both hands." \
18
+ --num_validation_images=1
train/train_kontext_local.py ADDED
@@ -0,0 +1,876 @@
1
+ import argparse
2
+ import copy
3
+ import logging
4
+ import math
5
+ import os
6
+ import shutil
7
+ from contextlib import nullcontext
8
+ from pathlib import Path
9
+ import re
10
+
11
+ from safetensors.torch import save_file
12
+ from PIL import Image
13
+ import numpy as np
14
+ import torch
15
+ import torch.utils.checkpoint
16
+ import transformers
17
+
18
+ from accelerate import Accelerator
19
+ from accelerate.logging import get_logger
20
+ from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
21
+
22
+ import diffusers
23
+ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler
24
+ from diffusers.optimization import get_scheduler
25
+ from diffusers.training_utils import (
26
+ cast_training_params,
27
+ compute_density_for_timestep_sampling,
28
+ compute_loss_weighting_for_sd3,
29
+ )
30
+ from diffusers.utils.torch_utils import is_compiled_module
31
+ from diffusers.utils import (
32
+ check_min_version,
33
+ is_wandb_available,
34
+ )
35
+
36
+ from src.prompt_helper import *
37
+ from src.lora_helper import *
38
+ from src.jsonl_datasets_kontext_local import make_train_dataset_mixed, collate_fn
39
+ from src.pipeline_flux_kontext_control import (
40
+ FluxKontextControlPipeline,
41
+ resize_position_encoding,
42
+ prepare_latent_subject_ids,
43
+ PREFERRED_KONTEXT_RESOLUTIONS
44
+ )
45
+ from src.transformer_flux import FluxTransformer2DModel
46
+ from diffusers.models.attention_processor import FluxAttnProcessor2_0
47
+ from src.layers import MultiDoubleStreamBlockLoraProcessor, MultiSingleStreamBlockLoraProcessor
48
+ from tqdm.auto import tqdm
49
+
50
+ if is_wandb_available():
51
+ import wandb
52
+
53
+
54
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
55
+ check_min_version("0.31.0.dev0")
56
+
57
+ logger = get_logger(__name__)
58
+
59
+
60
+ def compute_background_preserving_loss(model_pred, target, mask_values, weighting, background_weight: float = 3.0):
61
+ """
62
+ Compute loss with higher penalty on background (non-masked) regions to preserve them.
63
+ model_pred/target: [B, C, H, W]
64
+ mask_values: [B, 1, H_img, W_img] with values in {0,1} at image resolution
65
+ weighting: broadcastable to [B, C, H, W]
66
+ Returns per-pixel loss map [B, C, H, W]
67
+ """
68
+ base_loss = (weighting.float() * (model_pred.float() - target.float()) ** 2)
69
+ mask_latent = torch.nn.functional.interpolate(
70
+ mask_values,
71
+ size=(model_pred.shape[2], model_pred.shape[3]),
72
+ mode='bilinear',
73
+ align_corners=False,
74
+ )
75
+ foreground_mask = mask_latent
76
+ background_mask = 1.0 - mask_latent
77
+ foreground_mask = foreground_mask.expand_as(base_loss)
78
+ background_mask = background_mask.expand_as(base_loss)
79
+ foreground_loss = base_loss * foreground_mask
80
+ background_loss = base_loss * background_mask * float(background_weight)
81
+ total_loss = foreground_loss + background_loss
82
+ return total_loss
83
+
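# Editor's sketch (illustrative only, not part of the commit): reducing the per-pixel map
# returned by compute_background_preserving_loss above to a scalar, mirroring the
# reshape-and-mean reduction these training scripts apply to their flow-matching MSE.
# Shapes below are invented for the example.
#
#   import torch
#   pred = torch.randn(2, 16, 64, 64)
#   target = torch.randn(2, 16, 64, 64)
#   mask = (torch.rand(2, 1, 512, 512) > 0.5).float()   # 1 = edited region, 0 = background
#   weighting = torch.ones(2, 1, 1, 1)
#   loss_map = compute_background_preserving_loss(pred, target, mask, weighting, background_weight=3.0)
#   loss = loss_map.reshape(loss_map.shape[0], -1).mean(dim=1).mean()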
84
+ def log_validation(
85
+ pipeline,
86
+ args,
87
+ accelerator,
88
+ pipeline_args,
89
+ step,
90
+ torch_dtype,
91
+ is_final_validation=False,
92
+ ):
93
+ logger.info(
94
+ f"Running validation... Strict per-case evaluation for image, spatial image, and prompt."
95
+ )
96
+ pipeline = pipeline.to(accelerator.device)
97
+ pipeline.set_progress_bar_config(disable=True)
98
+
99
+ generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
100
+ autocast_ctx = nullcontext()
101
+
102
+ # Build per-case evaluation: require equal lengths for image, spatial image, and prompt
103
+ if args.validation_images is None or args.validation_images == ['None']:
104
+ raise ValueError("validation_images must be provided and non-empty")
105
+ if args.validation_prompt is None:
106
+ raise ValueError("validation_prompt must be provided and non-empty")
107
+
108
+ control_dict_root = dict(pipeline_args.get("control_dict", {})) if pipeline_args is not None else {}
109
+ spatial_ls = control_dict_root.get("spatial_images", []) or []
110
+
111
+ val_imgs = args.validation_images
112
+ prompts = args.validation_prompt
113
+
114
+ if not (len(val_imgs) == len(prompts) == len(spatial_ls)):
115
+ raise ValueError(
116
+ f"Length mismatch: validation_images={len(val_imgs)}, validation_prompt={len(prompts)}, spatial_images={len(spatial_ls)}"
117
+ )
118
+
119
+ results = []
120
+
121
+ def _resize_to_preferred(img: Image.Image) -> Image.Image:
122
+ w, h = img.size
123
+ aspect_ratio = w / h if h != 0 else 1.0
124
+ _, target_w, target_h = min(
125
+ (abs(aspect_ratio - (pref_w / pref_h)), pref_w, pref_h)
126
+ for (pref_h, pref_w) in PREFERRED_KONTEXT_RESOLUTIONS
127
+ )
128
+ return img.resize((target_w, target_h), Image.BICUBIC)
129
+
130
+ # Distributed per-rank assignment: each process handles its own slice of cases
131
+ num_cases = len(prompts)
132
+ logger.info(f"Paired validation (distributed): {num_cases} cases across {accelerator.num_processes} ranks")
133
+
134
+ rank = accelerator.process_index
135
+ world_size = accelerator.num_processes
136
+ local_indices = list(range(rank, num_cases, world_size))
137
+
138
+ local_images = []
139
+ with autocast_ctx:
140
+ for idx in local_indices:
141
+ try:
142
+ base_img = Image.open(val_imgs[idx]).convert("RGB")
143
+ resized_img = _resize_to_preferred(base_img)
144
+ except Exception as e:
145
+ raise ValueError(f"Failed to load/resize validation image idx={idx}: {e}")
146
+
147
+ case_args = dict(pipeline_args) if pipeline_args is not None else {}
148
+ case_args.pop("height", None)
149
+ case_args.pop("width", None)
150
+ if resized_img is not None:
151
+ tw, th = resized_img.size
152
+ case_args["height"] = th
153
+ case_args["width"] = tw
154
+
155
+ case_control = dict(case_args.get("control_dict", {}))
156
+ spatial_case = spatial_ls[idx]
157
+
158
+ # Compose masked image cond: resized_img * (1 - binary_mask)
159
+ try:
160
+ mask_img = Image.open(spatial_case).convert("L") if isinstance(spatial_case, str) else spatial_case.convert("L")
161
+ except Exception:
162
+ mask_img = spatial_case.convert("L")
163
+ mask_img = mask_img.resize(resized_img.size, Image.NEAREST)
164
+ mask_np = np.array(mask_img)
165
+ mask_bin = (mask_np > 127).astype(np.uint8)
166
+ inv_mask = (1 - mask_bin).astype(np.uint8)
167
+ base_np = np.array(resized_img)
168
+ masked_np = base_np * inv_mask[..., None]
169
+ masked_img = Image.fromarray(masked_np.astype(np.uint8))
170
+
171
+ case_control["spatial_images"] = [masked_img]
172
+ case_args["control_dict"] = case_control
173
+
174
+ case_args["prompt"] = prompts[idx]
175
+ img = pipeline(image=resized_img, **case_args, generator=generator).images[0]
176
+ local_images.append(img)
177
+
178
+ # Gather one image per rank (pad missing ranks with black images) to main process
179
+ fixed_size = (1024, 1024)
180
+ has_sample = torch.tensor([1 if len(local_images) > 0 else 0], device=accelerator.device, dtype=torch.int)
181
+ local_idx = torch.tensor([local_indices[0] if len(local_indices) > 0 else -1], device=accelerator.device, dtype=torch.long)
182
+ if len(local_images) > 0:
183
+ gathered_img = local_images[0].resize(fixed_size, Image.BICUBIC)
184
+ img_np = np.asarray(gathered_img).astype(np.uint8)
185
+ else:
186
+ img_np = np.zeros((fixed_size[1], fixed_size[0], 3), dtype=np.uint8)
187
+ img_tensor = torch.from_numpy(img_np).to(device=accelerator.device)
188
+ if img_tensor.ndim == 3:
189
+ img_tensor = img_tensor.unsqueeze(0)
190
+
191
+ gathered_has = accelerator.gather(has_sample)
192
+ gathered_idx = accelerator.gather(local_idx)
193
+ gathered_imgs = accelerator.gather(img_tensor)
194
+
195
+ if accelerator.is_main_process:
196
+ for i in range(int(gathered_has.shape[0])):
197
+ if int(gathered_has[i].item()) == 1:
198
+ idx = int(gathered_idx[i].item())
199
+ arr = gathered_imgs[i].cpu().numpy()
200
+ pil_img = Image.fromarray(arr.astype(np.uint8))
201
+ # Resize back to original validation image size
202
+ try:
203
+ orig = Image.open(val_imgs[idx]).convert("RGB")
204
+ pil_img = pil_img.resize(orig.size, Image.BICUBIC)
205
+ except Exception:
206
+ pass
207
+ results.append(pil_img)
208
+
209
+ del pipeline
210
+ if torch.cuda.is_available():
211
+ torch.cuda.empty_cache()
212
+
213
+ return results
214
+
215
+
216
+ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"):
217
+ text_encoder_config = transformers.PretrainedConfig.from_pretrained(
218
+ pretrained_model_name_or_path, subfolder=subfolder, revision=revision
219
+ )
220
+ model_class = text_encoder_config.architectures[0]
221
+ if model_class == "CLIPTextModel":
222
+ from transformers import CLIPTextModel
223
+
224
+ return CLIPTextModel
225
+ elif model_class == "T5EncoderModel":
226
+ from transformers import T5EncoderModel
227
+
228
+ return T5EncoderModel
229
+ else:
230
+ raise ValueError(f"{model_class} is not supported.")
231
+
232
+
233
+ def parse_args(input_args=None):
234
+ parser = argparse.ArgumentParser(description="Training script for Flux Kontext with EasyControl.")
235
+ parser.add_argument("--lora_num", type=int, default=1, help="number of the lora.")
236
+ parser.add_argument("--cond_size", type=int, default=512, help="size of the condition data.")
237
+ parser.add_argument("--mode", type=str, default=None, help="Controller mode; kept for compatibility.")
238
+
239
+ # New dataset (local edits + inpaint JSONL) mixed 1:1
240
+ parser.add_argument("--local_edits_json", type=str, default="/robby/share/Editing/qingyan/InstructV2V/Qwen2_5_72B_instructs_10W.json", help="Path to local edits JSON")
241
+ parser.add_argument("--train_data_dir", type=str, default="/robby/share/Editing/lzc/data/pexel_final/inpaint_edit_outputs_merged.jsonl", help="Path to inpaint JSONL file for mixing 1:1")
242
+ parser.add_argument("--source_frames_dir", type=str, default="/robby/share/Editing/qingyan/InstructV2V/pexel-video-merged-1frame", help="Root dir containing group folders like 0139")
243
+ parser.add_argument("--target_frames_dir", type=str, default="/robby/share/Editing/qingyan/InstructV2V/pexel-video-1frame-kontext-edit/local", help="Root dir containing group folders like 0139")
244
+ parser.add_argument("--masks_dir", type=str, default="/robby/share/Editing/lzc/InstructV2V/diff_masks", help="Root dir of precomputed masks organized as <group>/<prefix>_{i}.png")
245
+ parser.add_argument("--pretrained_model_name_or_path", type=str, default="", required=False, help="Base model path")
246
+ parser.add_argument("--pretrained_lora_path", type=str, default=None, required=False, help="LoRA checkpoint to initialize from")
247
+ parser.add_argument("--revision", type=str, default=None, required=False, help="Revision of pretrained model")
248
+ parser.add_argument("--variant", type=str, default=None, help="Variant of the model files")
249
+
250
+ parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
251
+ parser.add_argument("--max_sequence_length", type=int, default=128, help="Max sequence length for T5")
252
+ parser.add_argument("--kontext", type=str, default="enable")
253
+ parser.add_argument("--validation_prompt", type=str, nargs="+", default=None)
254
+ parser.add_argument("--validation_images", type=str, nargs="+", default=None, help="List of valiadation images")
255
+ parser.add_argument("--subject_test_images", type=str, nargs="+", default=None, help="List of subject test images")
256
+ parser.add_argument("--spatial_test_images", type=str, nargs="+", default=None, help="List of spatial test images")
257
+ parser.add_argument("--num_validation_images", type=int, default=4)
258
+ parser.add_argument("--validation_steps", type=int, default=20)
259
+
260
+ parser.add_argument("--ranks", type=int, nargs="+", default=[256], help="LoRA ranks")
261
+ parser.add_argument("--network_alphas", type=int, nargs="+", default=[256], help="LoRA network alphas")
262
+ parser.add_argument("--output_dir", type=str, default="/tiamat-NAS/zhangyuxuan/projects2/Easy_Control_0120/single_models/subject_model", help="Output directory")
263
+ parser.add_argument("--seed", type=int, default=None)
264
+ parser.add_argument("--train_batch_size", type=int, default=1)
265
+ parser.add_argument("--num_train_epochs", type=int, default=50)
266
+ parser.add_argument("--max_train_steps", type=int, default=None)
267
+ parser.add_argument("--checkpointing_steps", type=int, default=1000)
268
+ parser.add_argument("--checkpoints_total_limit", type=int, default=None)
269
+ parser.add_argument("--resume_from_checkpoint", type=str, default=None)
270
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
271
+ parser.add_argument("--gradient_checkpointing", action="store_true")
272
+ parser.add_argument("--learning_rate", type=float, default=1e-4)
273
+ parser.add_argument("--guidance_scale", type=float, default=1.0, help="Flux Kontext is guidance distilled")
274
+ parser.add_argument("--scale_lr", action="store_true", default=False)
275
+ parser.add_argument("--lr_scheduler", type=str, default="constant")
276
+ parser.add_argument("--lr_warmup_steps", type=int, default=500)
277
+ parser.add_argument("--lr_num_cycles", type=int, default=1)
278
+ parser.add_argument("--lr_power", type=float, default=1.0)
279
+ parser.add_argument("--dataloader_num_workers", type=int, default=8)
280
+ parser.add_argument("--weighting_scheme", type=str, default="none", choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"])
281
+ parser.add_argument("--logit_mean", type=float, default=0.0)
282
+ parser.add_argument("--logit_std", type=float, default=1.0)
283
+ parser.add_argument("--mode_scale", type=float, default=1.29)
284
+ parser.add_argument("--optimizer", type=str, default="AdamW")
285
+ parser.add_argument("--use_8bit_adam", action="store_true")
286
+ parser.add_argument("--adam_beta1", type=float, default=0.9)
287
+ parser.add_argument("--adam_beta2", type=float, default=0.999)
288
+ parser.add_argument("--prodigy_beta3", type=float, default=None)
289
+ parser.add_argument("--prodigy_decouple", type=bool, default=True)
290
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-04)
291
+ parser.add_argument("--adam_weight_decay_text_encoder", type=float, default=1e-03)
292
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08)
293
+ parser.add_argument("--prodigy_use_bias_correction", type=bool, default=True)
294
+ parser.add_argument("--prodigy_safeguard_warmup", type=bool, default=True)
295
+ parser.add_argument("--max_grad_norm", type=float, default=1.0)
296
+ parser.add_argument("--logging_dir", type=str, default="logs")
297
+ parser.add_argument("--cache_latents", action="store_true", default=False)
298
+ parser.add_argument("--report_to", type=str, default="tensorboard")
299
+ parser.add_argument("--mixed_precision", type=str, default="bf16", choices=["no", "fp16", "bf16"])
300
+ parser.add_argument("--upcast_before_saving", action="store_true", default=False)
301
+ parser.add_argument("--mix_ratio", type=float, default=0, help="Ratio of inpaint to local edits (B per A). 0=only local edits, 1=1:1, 2=1:2")
302
+ parser.add_argument("--background_weight", type=float, default=1.0, help="Background preserving loss weight multiplier")
303
+
304
+ # Blending options for dataset pixel_values
305
+ parser.add_argument("--blend_pixel_values", action="store_true", help="Blend target/source into pixel_values using mask")
306
+ parser.add_argument("--blend_kernel", type=int, default=21, help="Gaussian blur kernel size (must be odd)")
307
+ parser.add_argument("--blend_sigma", type=float, default=10.0, help="Gaussian blur sigma")
308
+
309
+ if input_args is not None:
310
+ args = parser.parse_args(input_args)
311
+ else:
312
+ args = parser.parse_args()
313
+ return args
314
+
315
+
316
+ def main(args):
317
+ if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
318
+ raise ValueError("Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 or fp32 instead.")
319
+
320
+ if args.output_dir is not None:
321
+ os.makedirs(args.output_dir, exist_ok=True)
322
+ os.makedirs(args.logging_dir, exist_ok=True)
323
+ logging_dir = Path(args.output_dir, args.logging_dir)
324
+
325
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
326
+ kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
327
+ accelerator = Accelerator(
328
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
329
+ mixed_precision=args.mixed_precision,
330
+ log_with=args.report_to,
331
+ project_config=accelerator_project_config,
332
+ kwargs_handlers=[kwargs],
333
+ )
334
+
335
+ if torch.backends.mps.is_available():
336
+ accelerator.native_amp = False
337
+
338
+ if args.report_to == "wandb":
339
+ if not is_wandb_available():
340
+ raise ImportError("Install wandb for logging during training.")
341
+
342
+ logging.basicConfig(
343
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
344
+ datefmt="%m/%d/%Y %H:%M:%S",
345
+ level=logging.INFO,
346
+ )
347
+ logger.info(accelerator.state, main_process_only=False)
348
+ if accelerator.is_local_main_process:
349
+ transformers.utils.logging.set_verbosity_warning()
350
+ diffusers.utils.logging.set_verbosity_info()
351
+ else:
352
+ transformers.utils.logging.set_verbosity_error()
353
+ diffusers.utils.logging.set_verbosity_error()
354
+
355
+ if args.seed is not None:
356
+ set_seed(args.seed)
357
+
358
+ if accelerator.is_main_process and args.output_dir is not None:
359
+ os.makedirs(args.output_dir, exist_ok=True)
360
+
361
+ # Tokenizers
362
+ tokenizer_one = transformers.CLIPTokenizer.from_pretrained(
363
+ args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
364
+ )
365
+ tokenizer_two = transformers.T5TokenizerFast.from_pretrained(
366
+ args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision
367
+ )
368
+
369
+ # Text encoders
370
+ text_encoder_cls_one = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder")
371
+ text_encoder_cls_two = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2")
372
+
373
+ # Scheduler and models
374
+ noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
375
+ noise_scheduler_copy = copy.deepcopy(noise_scheduler)
376
+ text_encoder_one, text_encoder_two = load_text_encoders(args, text_encoder_cls_one, text_encoder_cls_two)
377
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant)
378
+ transformer = FluxTransformer2DModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant)
379
+
380
+ # Train only LoRA adapters
381
+ transformer.requires_grad_(True)
382
+ vae.requires_grad_(False)
383
+ text_encoder_one.requires_grad_(False)
384
+ text_encoder_two.requires_grad_(False)
385
+
386
+ weight_dtype = torch.float32
387
+ if accelerator.mixed_precision == "fp16":
388
+ weight_dtype = torch.float16
389
+ elif accelerator.mixed_precision == "bf16":
390
+ weight_dtype = torch.bfloat16
391
+
392
+ if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
393
+ raise ValueError("Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 or fp32 instead.")
394
+
395
+ vae.to(accelerator.device, dtype=weight_dtype)
396
+ transformer.to(accelerator.device, dtype=weight_dtype)
397
+ text_encoder_one.to(accelerator.device, dtype=weight_dtype)
398
+ text_encoder_two.to(accelerator.device, dtype=weight_dtype)
399
+
400
+ if args.gradient_checkpointing:
401
+ transformer.enable_gradient_checkpointing()
402
+
403
+ # Setup LoRA attention processors
404
+ if args.pretrained_lora_path is not None:
405
+ lora_path = args.pretrained_lora_path
406
+ checkpoint = load_checkpoint(lora_path)
407
+ lora_attn_procs = {}
408
+ double_blocks_idx = list(range(19))
409
+ single_blocks_idx = list(range(38))
410
+ number = 1
411
+ for name, attn_processor in transformer.attn_processors.items():
412
+ match = re.search(r'\.(\d+)\.', name)
413
+ if match:
414
+ layer_index = int(match.group(1))
415
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
416
+ lora_state_dicts = {}
417
+ for key, value in checkpoint.items():
418
+ if re.search(r'\.(\d+)\.', key):
419
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
420
+ if checkpoint_layer_index == layer_index and key.startswith("transformer_blocks"):
421
+ lora_state_dicts[key] = value
422
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
423
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
424
+ )
425
+ for n in range(number):
426
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
427
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
428
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
429
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
430
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
431
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
432
+ lora_attn_procs[name].proj_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.down.weight', None)
433
+ lora_attn_procs[name].proj_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.up.weight', None)
434
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
435
+ lora_state_dicts = {}
436
+ for key, value in checkpoint.items():
437
+ if re.search(r'\.(\d+)\.', key):
438
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
439
+ if checkpoint_layer_index == layer_index and key.startswith("single_transformer_blocks"):
440
+ lora_state_dicts[key] = value
441
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
442
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
443
+ )
444
+ for n in range(number):
445
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
446
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
447
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
448
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
449
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
450
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
451
+ else:
452
+ lora_attn_procs[name] = FluxAttnProcessor2_0()
453
+ else:
454
+ lora_attn_procs = {}
455
+ double_blocks_idx = list(range(19))
456
+ single_blocks_idx = list(range(38))
457
+ for name, attn_processor in transformer.attn_processors.items():
458
+ match = re.search(r'\.(\d+)\.', name)
459
+ if match:
460
+ layer_index = int(match.group(1))
461
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
462
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
463
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
464
+ )
465
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
466
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
467
+ dim=3072, ranks=args.ranks, network_alphas=args.network_alphas, lora_weights=[1 for _ in range(args.lora_num)], device=accelerator.device, dtype=weight_dtype, cond_width=args.cond_size, cond_height=args.cond_size, n_loras=args.lora_num
468
+ )
469
+ else:
470
+ lora_attn_procs[name] = attn_processor
471
+
472
+ transformer.set_attn_processor(lora_attn_procs)
473
+ transformer.train()
474
+ for n, param in transformer.named_parameters():
475
+ if '_lora' not in n:
476
+ param.requires_grad = False
477
+ print(sum([p.numel() for p in transformer.parameters() if p.requires_grad]) / 1000000, 'M parameters')
478
+
479
+ def unwrap_model(model):
480
+ model = accelerator.unwrap_model(model)
481
+ model = model._orig_mod if is_compiled_module(model) else model
482
+ return model
483
+
484
+ if args.resume_from_checkpoint:
485
+ path = args.resume_from_checkpoint
486
+ global_step = int(path.split("-")[-1])
487
+ initial_global_step = global_step
488
+ else:
489
+ initial_global_step = 0
490
+ global_step = 0
491
+ first_epoch = 0
492
+
493
+ if args.scale_lr:
494
+ args.learning_rate = (
495
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
496
+ )
497
+
498
+ if args.mixed_precision == "fp16":
499
+ models = [transformer]
500
+ cast_training_params(models, dtype=torch.float32)
501
+
502
+ params_to_optimize = [p for p in transformer.parameters() if p.requires_grad]
503
+ transformer_parameters_with_lr = {"params": params_to_optimize, "lr": args.learning_rate}
504
+ print(sum([p.numel() for p in transformer.parameters() if p.requires_grad]) / 1000000, 'M parameters')
505
+
506
+ optimizer_class = torch.optim.AdamW
507
+ optimizer = optimizer_class(
508
+ [transformer_parameters_with_lr],
509
+ betas=(args.adam_beta1, args.adam_beta2),
510
+ weight_decay=args.adam_weight_decay,
511
+ eps=args.adam_epsilon,
512
+ )
513
+
514
+ tokenizers = [tokenizer_one, tokenizer_two]
515
+ text_encoders = [text_encoder_one, text_encoder_two]
516
+
517
+ train_dataset = make_train_dataset_mixed(args, tokenizers, accelerator)
518
+ train_dataloader = torch.utils.data.DataLoader(
519
+ train_dataset,
520
+ batch_size=args.train_batch_size,
521
+ shuffle=True,
522
+ collate_fn=collate_fn,
523
+ num_workers=args.dataloader_num_workers,
524
+ )
525
+
526
+ vae_config_shift_factor = vae.config.shift_factor
527
+ vae_config_scaling_factor = vae.config.scaling_factor
528
+
529
+ overrode_max_train_steps = False
530
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
531
+ if args.resume_from_checkpoint:
532
+ first_epoch = global_step // num_update_steps_per_epoch
533
+ if args.max_train_steps is None:
534
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
535
+ overrode_max_train_steps = True
536
+
537
+ lr_scheduler = get_scheduler(
538
+ args.lr_scheduler,
539
+ optimizer=optimizer,
540
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
541
+ num_training_steps=args.max_train_steps * accelerator.num_processes,
542
+ num_cycles=args.lr_num_cycles,
543
+ power=args.lr_power,
544
+ )
545
+
546
+ transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
547
+ transformer, optimizer, train_dataloader, lr_scheduler
548
+ )
549
+
550
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
551
+ if overrode_max_train_steps:
552
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
553
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
554
+
555
+ # Sanitize config for TensorBoard hparams (only allow int/float/bool/str/tensor). Others are stringified if possible; otherwise dropped
556
+ def _sanitize_hparams(config_dict):
557
+ sanitized = {}
558
+ for key, value in dict(config_dict).items():
559
+ try:
560
+ if value is None:
561
+ continue
562
+ # numpy scalar types
563
+ if isinstance(value, (np.integer,)):
564
+ sanitized[key] = int(value)
565
+ elif isinstance(value, (np.floating,)):
566
+ sanitized[key] = float(value)
567
+ elif isinstance(value, (int, float, bool, str)):
568
+ sanitized[key] = value
569
+ elif isinstance(value, Path):
570
+ sanitized[key] = str(value)
571
+ elif isinstance(value, (list, tuple)):
572
+ # stringify simple sequences; skip if fails
573
+ sanitized[key] = str(value)
574
+ else:
575
+ # best-effort stringify
576
+ sanitized[key] = str(value)
577
+ except Exception:
578
+ # skip unconvertible entries
579
+ continue
580
+ return sanitized
581
+
582
+ if accelerator.is_main_process:
583
+ tracker_name = "Easy_Control_Kontext"
584
+ accelerator.init_trackers(tracker_name, config=_sanitize_hparams(vars(args)))
585
+
586
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
587
+ logger.info("***** Running training *****")
588
+ logger.info(f" Num examples = {len(train_dataset)}")
589
+ logger.info(f" Num batches each epoch = {len(train_dataloader)}")
590
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
591
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
592
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
593
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
594
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
595
+
596
+ progress_bar = tqdm(
597
+ range(0, args.max_train_steps),
598
+ initial=initial_global_step,
599
+ desc="Steps",
600
+ disable=not accelerator.is_local_main_process,
601
+ )
602
+
603
+ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
604
+ sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
605
+ schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
606
+ timesteps = timesteps.to(accelerator.device)
607
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
608
+ sigma = sigmas[step_indices].flatten()
609
+ while len(sigma.shape) < n_dim:
610
+ sigma = sigma.unsqueeze(-1)
611
+ return sigma
612
+
613
+ # Kontext specifics
614
+ vae_scale_factor = 8 # Kontext uses 8x VAE factor; pack/unpack uses additional 2x in methods
615
+ # Match pipeline's prepare_latents cond resolution: 2 * (cond_size // (vae_scale_factor * 2))
616
+ height_cond = 2 * (args.cond_size // (vae_scale_factor * 2))
617
+ width_cond = 2 * (args.cond_size // (vae_scale_factor * 2))
618
+ offset = 64
619
+
620
+ for epoch in range(first_epoch, args.num_train_epochs):
621
+ transformer.train()
622
+ for step, batch in enumerate(train_dataloader):
623
+ models_to_accumulate = [transformer]
624
+ with accelerator.accumulate(models_to_accumulate):
625
+ tokens = [batch["text_ids_1"], batch["text_ids_2"]]
626
+ prompt_embeds, pooled_prompt_embeds, text_ids = encode_token_ids(text_encoders, tokens, accelerator)
627
+ prompt_embeds = prompt_embeds.to(dtype=vae.dtype, device=accelerator.device)
628
+ pooled_prompt_embeds = pooled_prompt_embeds.to(dtype=vae.dtype, device=accelerator.device)
629
+ text_ids = text_ids.to(dtype=vae.dtype, device=accelerator.device)
630
+
631
+ pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
632
+ height_ = 2 * (int(pixel_values.shape[-2]) // (vae_scale_factor * 2))
633
+ width_ = 2 * (int(pixel_values.shape[-1]) // (vae_scale_factor * 2))
634
+
635
+ model_input = vae.encode(pixel_values).latent_dist.sample()
636
+ model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
637
+ model_input = model_input.to(dtype=weight_dtype)
638
+
639
+ latent_image_ids, cond_latent_image_ids = resize_position_encoding(
640
+ model_input.shape[0], height_, width_, height_cond, width_cond, accelerator.device, weight_dtype
641
+ )
642
+
643
+ noise = torch.randn_like(model_input)
644
+ bsz = model_input.shape[0]
645
+
646
+ u = compute_density_for_timestep_sampling(
647
+ weighting_scheme=args.weighting_scheme,
648
+ batch_size=bsz,
649
+ logit_mean=args.logit_mean,
650
+ logit_std=args.logit_std,
651
+ mode_scale=args.mode_scale,
652
+ )
653
+ indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
654
+ timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
655
+
656
+ sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
657
+ noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
658
+
659
+ packed_noisy_model_input = FluxKontextControlPipeline._pack_latents(
660
+ noisy_model_input,
661
+ batch_size=model_input.shape[0],
662
+ num_channels_latents=model_input.shape[1],
663
+ height=model_input.shape[2],
664
+ width=model_input.shape[3],
665
+ )
666
+
667
+ latent_image_ids_to_concat = [latent_image_ids]
668
+ packed_cond_model_input_to_concat = []
669
+
670
+ if args.kontext == "enable":
671
+ source_pixel_values = batch["source_pixel_values"].to(dtype=vae.dtype)
672
+ source_image_latents = vae.encode(source_pixel_values).latent_dist.sample()
673
+ source_image_latents = (source_image_latents - vae_config_shift_factor) * vae_config_scaling_factor
674
+ image_latent_h, image_latent_w = source_image_latents.shape[2:]
675
+ packed_image_latents = FluxKontextControlPipeline._pack_latents(
676
+ source_image_latents,
677
+ batch_size=source_image_latents.shape[0],
678
+ num_channels_latents=source_image_latents.shape[1],
679
+ height=image_latent_h,
680
+ width=image_latent_w,
681
+ )
682
+ source_image_ids = FluxKontextControlPipeline._prepare_latent_image_ids(
683
+ batch_size=source_image_latents.shape[0],
684
+ height=image_latent_h // 2,
685
+ width=image_latent_w // 2,
686
+ device=accelerator.device,
687
+ dtype=weight_dtype,
688
+ )
689
+ source_image_ids[..., 0] = 1 # Mark as condition
690
+ latent_image_ids_to_concat.append(source_image_ids)
691
+
692
+
693
+ subject_pixel_values = batch.get("subject_pixel_values")
694
+ if subject_pixel_values is not None:
695
+ subject_pixel_values = subject_pixel_values.to(dtype=vae.dtype)
696
+ subject_input = vae.encode(subject_pixel_values).latent_dist.sample()
697
+ subject_input = (subject_input - vae_config_shift_factor) * vae_config_scaling_factor
698
+ subject_input = subject_input.to(dtype=weight_dtype)
699
+ sub_number = subject_pixel_values.shape[-2] // args.cond_size
700
+ latent_subject_ids = prepare_latent_subject_ids(height_cond // 2, width_cond // 2, accelerator.device, weight_dtype)
701
+ latent_subject_ids[..., 0] = 2
702
+ latent_subject_ids[:, 1] += offset
703
+ sub_latent_image_ids = torch.cat([latent_subject_ids for _ in range(sub_number)], dim=0)
704
+ latent_image_ids_to_concat.append(sub_latent_image_ids)
705
+
706
+ packed_subject_model_input = FluxKontextControlPipeline._pack_latents(
707
+ subject_input,
708
+ batch_size=subject_input.shape[0],
709
+ num_channels_latents=subject_input.shape[1],
710
+ height=subject_input.shape[2],
711
+ width=subject_input.shape[3],
712
+ )
713
+ packed_cond_model_input_to_concat.append(packed_subject_model_input)
714
+
715
+ cond_pixel_values = batch.get("cond_pixel_values")
716
+ if cond_pixel_values is not None:
717
+ cond_pixel_values = cond_pixel_values.to(dtype=vae.dtype)
718
+ cond_input = vae.encode(cond_pixel_values).latent_dist.sample()
719
+ cond_input = (cond_input - vae_config_shift_factor) * vae_config_scaling_factor
720
+ cond_input = cond_input.to(dtype=weight_dtype)
721
+ cond_number = cond_pixel_values.shape[-2] // args.cond_size
722
+ cond_latent_image_ids[..., 0] = 2
723
+ cond_latent_image_ids_rep = torch.cat([cond_latent_image_ids for _ in range(cond_number)], dim=0)
724
+ latent_image_ids_to_concat.append(cond_latent_image_ids_rep)
725
+
726
+ packed_cond_model_input = FluxKontextControlPipeline._pack_latents(
727
+ cond_input,
728
+ batch_size=cond_input.shape[0],
729
+ num_channels_latents=cond_input.shape[1],
730
+ height=cond_input.shape[2],
731
+ width=cond_input.shape[3],
732
+ )
733
+ packed_cond_model_input_to_concat.append(packed_cond_model_input)
734
+
735
+ latent_image_ids = torch.cat(latent_image_ids_to_concat, dim=0)
736
+ cond_packed_noisy_model_input = torch.cat(packed_cond_model_input_to_concat, dim=1)
737
+
738
+ if accelerator.unwrap_model(transformer).config.guidance_embeds:
739
+ guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
740
+ guidance = guidance.expand(model_input.shape[0])
741
+ else:
742
+ guidance = None
743
+
744
+ latent_model_input = packed_noisy_model_input
745
+ if args.kontext == "enable":
746
+ latent_model_input = torch.cat([latent_model_input, packed_image_latents], dim=1)
747
+ model_pred = transformer(
748
+ hidden_states=latent_model_input,
749
+ cond_hidden_states=cond_packed_noisy_model_input,
750
+ timestep=timesteps / 1000,
751
+ guidance=guidance,
752
+ pooled_projections=pooled_prompt_embeds,
753
+ encoder_hidden_states=prompt_embeds,
754
+ txt_ids=text_ids,
755
+ img_ids=latent_image_ids,
756
+ return_dict=False,
757
+ )[0]
758
+
759
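+ # The transformer returns predictions for the whole packed token sequence (noisy target
+ # latents plus the source-image latents appended when kontext is enabled); keep only the
+ # leading block that corresponds to the target latents before unpacking for the loss.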
+ model_pred = model_pred[:, : packed_noisy_model_input.size(1)]
760
+
761
+ model_pred = FluxKontextControlPipeline._unpack_latents(
762
+ model_pred,
763
+ height=int(pixel_values.shape[-2]),
764
+ width=int(pixel_values.shape[-1]),
765
+ vae_scale_factor=vae_scale_factor,
766
+ )
767
+
768
+ weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
769
+ target = noise - model_input
770
+
771
+ # mask_values = batch.get("mask_values")
772
+ # if mask_values is not None:
773
+ # mask_values = mask_values.to(device=accelerator.device, dtype=model_pred.dtype)
774
+ # loss_map = compute_background_preserving_loss(
775
+ # model_pred=model_pred,
776
+ # target=target,
777
+ # mask_values=mask_values,
778
+ # weighting=weighting,
779
+ # background_weight=args.background_weight,
780
+ # )
781
+ # loss = torch.mean(loss_map.reshape(target.shape[0], -1), 1)
782
+ # loss = loss.mean()
783
+ # else:
784
+ loss = torch.mean((weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1), 1)
785
+ loss = loss.mean()
786
+ accelerator.backward(loss)
787
+ if accelerator.sync_gradients:
788
+ params_to_clip = (transformer.parameters())
789
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
790
+
791
+ optimizer.step()
792
+ lr_scheduler.step()
793
+ optimizer.zero_grad()
794
+
795
+ if accelerator.sync_gradients:
796
+ progress_bar.update(1)
797
+ global_step += 1
798
+
799
+ if accelerator.is_main_process:
800
+ if global_step % args.checkpointing_steps == 0:
801
+ if args.checkpoints_total_limit is not None:
802
+ checkpoints = os.listdir(args.output_dir)
803
+ checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
804
+ checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
805
+ if len(checkpoints) >= args.checkpoints_total_limit:
806
+ num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
807
+ removing_checkpoints = checkpoints[0:num_to_remove]
808
+ logger.info(f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints")
809
+ logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
810
+ for removing_checkpoint in removing_checkpoints:
811
+ removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
812
+ shutil.rmtree(removing_checkpoint)
813
+
814
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
815
+ os.makedirs(save_path, exist_ok=True)
816
+ unwrapped_model_state = accelerator.unwrap_model(transformer).state_dict()
817
+ lora_state_dict = {k: unwrapped_model_state[k] for k in unwrapped_model_state.keys() if '_lora' in k}
818
+ save_file(lora_state_dict, os.path.join(save_path, "lora.safetensors"))
819
+ logger.info(f"Saved state to {save_path}")
820
+
821
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
822
+ progress_bar.set_postfix(**logs)
823
+ accelerator.log(logs, step=global_step)
824
+
825
+ if args.validation_prompt is not None and global_step % args.validation_steps == 0:
826
+ pipeline = FluxKontextControlPipeline.from_pretrained(
827
+ args.pretrained_model_name_or_path,
828
+ vae=vae,
829
+ text_encoder=accelerator.unwrap_model(text_encoder_one),
830
+ text_encoder_2=accelerator.unwrap_model(text_encoder_two),
831
+ transformer=accelerator.unwrap_model(transformer),
832
+ revision=args.revision,
833
+ variant=args.variant,
834
+ torch_dtype=weight_dtype,
835
+ )
836
+
837
+ if args.spatial_test_images is not None and len(args.spatial_test_images) != 0 and args.spatial_test_images != ['None']:
838
+ spatial_paths = args.spatial_test_images
839
+ spatial_ls = [Image.open(image_path).convert("RGB") for image_path in spatial_paths]
840
+ else:
841
+ spatial_ls = []
842
+
843
+ pipeline_args = {
844
+ "prompt": args.validation_prompt,
845
+ "cond_size": args.cond_size,
846
+ "guidance_scale": 3.5,
847
+ "num_inference_steps": 20,
848
+ "max_sequence_length": 128,
849
+ "control_dict": {"spatial_images": spatial_ls},
850
+ }
851
+
852
+ images = log_validation(
853
+ pipeline=pipeline,
854
+ args=args,
855
+ accelerator=accelerator,
856
+ pipeline_args=pipeline_args,
857
+ step=global_step,
858
+ torch_dtype=weight_dtype,
859
+ )
860
+ if accelerator.is_main_process:
861
+ save_path = os.path.join(args.output_dir, "validation")
862
+ os.makedirs(save_path, exist_ok=True)
863
+ save_folder = os.path.join(save_path, f"checkpoint-{global_step}")
864
+ os.makedirs(save_folder, exist_ok=True)
865
+ for idx, img in enumerate(images):
866
+ img.save(os.path.join(save_folder, f"{idx}.jpg"))
867
+ del pipeline
868
+
869
+ accelerator.wait_for_everyone()
870
+ accelerator.end_training()
871
+
872
+
873
+ if __name__ == "__main__":
874
+ args = parse_args()
875
+ main(args)
876
+
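
The trainer above optimizes a flow-matching objective: latents are linearly interpolated toward noise with a sampled sigma (noisy_model_input = (1 - sigma) * latents + sigma * noise) and the network regresses the velocity noise - latents under a per-sigma weighting. A minimal, self-contained sketch of that loss, with illustrative tensor names rather than the script's variables:

import torch

def flow_matching_loss(model_pred, latents, noise, weighting=None):
    # Velocity target: the direction from clean latents toward pure noise.
    target = noise - latents
    per_elem = (model_pred.float() - target.float()) ** 2
    if weighting is not None:
        per_elem = weighting.float() * per_elem
    # Average over all non-batch dimensions, then over the batch.
    return per_elem.reshape(latents.shape[0], -1).mean(dim=1).mean()

x0 = torch.randn(2, 16, 64, 64)        # clean VAE latents
eps = torch.randn_like(x0)             # Gaussian noise
sigma = torch.rand(2, 1, 1, 1)         # sampled noise level
xt = (1.0 - sigma) * x0 + sigma * eps  # what the transformer would actually be fed
print(flow_matching_loss(model_pred=eps - x0, latents=x0, noise=eps))  # tensor(0.) for an ideal prediction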
train/train_kontext_local.sh ADDED
@@ -0,0 +1,26 @@
1
+ export MODEL_DIR="" # your flux path
2
+ export OUTPUT_DIR="" # your save path
3
+ export CONFIG="./default_config.yaml"
4
+ export LOG_PATH="$OUTPUT_DIR/log"
5
+
6
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file $CONFIG train_kontext_local.py \
7
+ --pretrained_model_name_or_path $MODEL_DIR \
8
+ --pretrained_lora_path "" \
9
+ --lora_num=1 \
10
+ --cond_size=512 \
11
+ --ranks 128 \
12
+ --network_alphas 128 \
13
+ --output_dir=$OUTPUT_DIR \
14
+ --logging_dir=$LOG_PATH \
15
+ --mixed_precision="bf16" \
16
+ --learning_rate=1e-4 \
17
+ --train_batch_size=1 \
18
+ --num_train_epochs=1 \
19
+ --validation_steps=250 \
20
+ --checkpointing_steps=1000 \
21
+ --validation_images "./kontext_local_test/img_1.png" \
22
+ --spatial_test_images "./kontext_local_test/mask_1.png" \
23
+ --validation_prompt "convert the dinosaur into blue color" \
24
+ --gradient_checkpointing \
25
+ --blend_pixel_values \
26
+ --num_validation_images=1
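
With --ranks 128 and --network_alphas 128 the LoRA scale alpha/r equals 1.0, so the learned low-rank update is added to each frozen weight at full strength. A rough sketch of that composition under the standard LoRA formulation (the repo's control-LoRA layers may apply it differently in detail):

import torch

rank, alpha = 128, 128              # matches --ranks 128 --network_alphas 128 above
d_out, d_in = 3072, 3072            # an illustrative Flux projection size, not taken from the repo
W = torch.randn(d_out, d_in)        # frozen base weight
A = torch.randn(rank, d_in) * 0.01  # LoRA down-projection
B = torch.zeros(d_out, rank)        # LoRA up-projection, zero-initialized so training starts at W

scale = alpha / rank                # 128 / 128 = 1.0
W_effective = W + scale * (B @ A)   # what the adapted linear layer effectively applies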
train/train_kontext_lora.py ADDED
@@ -0,0 +1,871 @@
1
+ import argparse
2
+ import copy
3
+ import logging
4
+ import math
5
+ import os
6
+ import shutil
7
+ from contextlib import nullcontext
8
+ from pathlib import Path
9
+ import re
10
+ import time
11
+
12
+ from safetensors.torch import save_file
13
+ from PIL import Image
14
+ import numpy as np
15
+ import torch
16
+ import torch.utils.checkpoint
17
+ import transformers
18
+
19
+ from accelerate import Accelerator
20
+ from accelerate.logging import get_logger
21
+ from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
22
+
23
+ import diffusers
24
+ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline
25
+ from diffusers.optimization import get_scheduler
26
+ from diffusers.training_utils import (
27
+ cast_training_params,
28
+ compute_density_for_timestep_sampling,
29
+ compute_loss_weighting_for_sd3,
30
+ )
31
+ from diffusers.utils.torch_utils import is_compiled_module
32
+ from diffusers.utils import (
33
+ check_min_version,
34
+ is_wandb_available,
35
+ )
36
+
37
+ from src.prompt_helper import *
38
+ from src.lora_helper import *
39
+ from src.jsonl_datasets_kontext_interactive_lora import make_interactive_dataset_subjects, make_placement_dataset_subjects, make_pexels_dataset_subjects, make_mixed_dataset, collate_fn
40
+ from diffusers import FluxKontextPipeline
41
+ from diffusers.models import FluxTransformer2DModel
42
+ from tqdm.auto import tqdm
43
+ from peft import LoraConfig
44
+ from peft.utils import get_peft_model_state_dict
45
+ from diffusers.utils import convert_state_dict_to_diffusers
46
+
47
+ if is_wandb_available():
48
+ import wandb
49
+
50
+
51
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risk.
52
+ check_min_version("0.31.0.dev0")
53
+
54
+ logger = get_logger(__name__)
55
+
56
+
57
+ PREFERRED_KONTEXT_RESOLUTIONS = [
58
+ (672, 1568),
59
+ (688, 1504),
60
+ (720, 1456),
61
+ (752, 1392),
62
+ (832, 1248),
63
+ (880, 1184),
64
+ (944, 1104),
65
+ (1024, 1024),
66
+ (1104, 944),
67
+ (1184, 880),
68
+ (1248, 832),
69
+ (1392, 752),
70
+ (1456, 720),
71
+ (1504, 688),
72
+ (1568, 672),
73
+ ]
74
+
75
+
76
+ def log_validation(
77
+ pipeline,
78
+ args,
79
+ accelerator,
80
+ pipeline_args,
81
+ step,
82
+ torch_dtype,
83
+ is_final_validation=False,
84
+ ):
85
+ logger.info(
86
+ f"Running validation... Paired evaluation for image and prompt."
87
+ )
88
+ pipeline = pipeline.to(device=accelerator.device, dtype=torch_dtype)
89
+ pipeline.set_progress_bar_config(disable=True)
90
+
91
+ generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
92
+ # Match compute dtype for validation to avoid dtype mismatches (e.g., VAE bf16 vs float latents)
93
+ if torch_dtype in (torch.float16, torch.bfloat16):
94
+ device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
95
+ autocast_ctx = torch.autocast(device_type=device_type, dtype=torch_dtype)
96
+ else:
97
+ autocast_ctx = nullcontext()
98
+
99
+ # Build per-case evaluation
100
+ if args.validation_images is None or args.validation_images == ['None']:
101
+ raise ValueError("validation_images must be provided and non-empty")
102
+ if args.validation_prompt is None:
103
+ raise ValueError("validation_prompt must be provided and non-empty")
104
+
105
+ val_imgs = args.validation_images
106
+ prompts = args.validation_prompt
107
+ # Prepend instruction to each prompt (same as dataset/test requirement)
108
+ instruction = "Fill in the white region naturally and adapt the foreground into the background. Fix the perspective of the foreground object if necessary."
109
+ try:
110
+ prompts = [f"{instruction} {p}".strip() if isinstance(p, str) and len(p.strip()) > 0 else instruction for p in prompts]
111
+ except Exception:
112
+ # Fallback: keep original prompts if unexpected
113
+ pass
114
+
115
+ if not (len(val_imgs) == len(prompts)):
116
+ raise ValueError(
117
+ f"Length mismatch: validation_images={len(val_imgs)}, validation_prompt={len(prompts)}"
118
+ )
119
+
120
+ results = []
121
+
122
+ def _resize_to_preferred(img: Image.Image) -> Image.Image:
123
+ w, h = img.size
124
+ aspect_ratio = w / h if h != 0 else 1.0
125
+ _, target_w, target_h = min(
126
+ (abs(aspect_ratio - (pref_w / pref_h)), pref_w, pref_h)
127
+ for (pref_h, pref_w) in PREFERRED_KONTEXT_RESOLUTIONS
128
+ )
129
+ return img.resize((target_w, target_h), Image.BICUBIC)
130
+
131
+ # Distributed per-rank assignment: each process handles its own slice of cases
132
+ num_cases = len(prompts)
133
+ logger.info(f"Paired validation (distributed): {num_cases} cases across {accelerator.num_processes} ranks")
134
+
135
+ # Indices assigned to this rank
136
+ rank = accelerator.process_index
137
+ world_size = accelerator.num_processes
138
+ local_indices = list(range(rank, num_cases, world_size))
139
+
140
+ local_images = []
141
+ with autocast_ctx:
142
+ for idx in local_indices:
143
+ try:
144
+ base_img = Image.open(val_imgs[idx]).convert("RGB")
145
+ resized_img = _resize_to_preferred(base_img)
146
+ except Exception as e:
147
+ raise ValueError(f"Failed to load/resize validation image idx={idx}: {e}")
148
+
149
+ case_args = dict(pipeline_args) if pipeline_args is not None else {}
150
+ case_args.pop("height", None)
151
+ case_args.pop("width", None)
152
+ if resized_img is not None:
153
+ tw, th = resized_img.size
154
+ case_args["height"] = th
155
+ case_args["width"] = tw
156
+
157
+ case_args["prompt"] = prompts[idx]
158
+ img = pipeline(image=resized_img, **case_args, generator=generator).images[0]
159
+ local_images.append(img)
160
+
161
+ # Gather all images per rank (pad to equal count) to main process
162
+ fixed_size = (1024, 1024)
163
+ max_local = int(math.ceil(num_cases / world_size)) if world_size > 0 else len(local_images)
164
+ # Build per-rank batch tensors
165
+ imgs_rank = []
166
+ idx_rank = []
167
+ has_rank = []
168
+ for j in range(max_local):
169
+ if j < len(local_images):
170
+ resized = local_images[j].resize(fixed_size, Image.BICUBIC)
171
+ img_np = np.asarray(resized).astype(np.uint8)
172
+ imgs_rank.append(torch.from_numpy(img_np))
173
+ idx_rank.append(local_indices[j])
174
+ has_rank.append(1)
175
+ else:
176
+ imgs_rank.append(torch.from_numpy(np.zeros((fixed_size[1], fixed_size[0], 3), dtype=np.uint8)))
177
+ idx_rank.append(-1)
178
+ has_rank.append(0)
179
+ imgs_rank_tensor = torch.stack([t.to(device=accelerator.device) for t in imgs_rank], dim=0) # [max_local, H, W, C]
180
+ idx_rank_tensor = torch.tensor(idx_rank, device=accelerator.device, dtype=torch.long) # [max_local]
181
+ has_rank_tensor = torch.tensor(has_rank, device=accelerator.device, dtype=torch.int) # [max_local]
182
+
183
+ gathered_has = accelerator.gather(has_rank_tensor) # [world * max_local]
184
+ gathered_idx = accelerator.gather(idx_rank_tensor) # [world * max_local]
185
+ gathered_imgs = accelerator.gather(imgs_rank_tensor) # [world * max_local, H, W, C]
186
+
187
+ if accelerator.is_main_process:
188
+ world = int(world_size)
189
+ slots = int(max_local)
190
+ try:
191
+ gathered_has = gathered_has.view(world, slots)
192
+ gathered_idx = gathered_idx.view(world, slots)
193
+ gathered_imgs = gathered_imgs.view(world, slots, fixed_size[1], fixed_size[0], 3)
194
+ except Exception:
195
+ # Fallback: treat as flat if reshape fails
196
+ gathered_has = gathered_has.view(-1, 1)
197
+ gathered_idx = gathered_idx.view(-1, 1)
198
+ gathered_imgs = gathered_imgs.view(-1, 1, fixed_size[1], fixed_size[0], 3)
199
+ world = int(gathered_has.shape[0])
200
+ slots = 1
201
+ for i in range(world):
202
+ for j in range(slots):
203
+ if int(gathered_has[i, j].item()) == 1:
204
+ idx = int(gathered_idx[i, j].item())
205
+ arr = gathered_imgs[i, j].cpu().numpy()
206
+ pil_img = Image.fromarray(arr.astype(np.uint8))
207
+ # Resize back to original validation image size
208
+ try:
209
+ orig = Image.open(val_imgs[idx]).convert("RGB")
210
+ pil_img = pil_img.resize(orig.size, Image.BICUBIC)
211
+ except Exception:
212
+ pass
213
+ results.append(pil_img)
214
+
215
+ # Log results (resize to 1024x1024 for saving or external trackers). Skip TensorBoard per request.
216
+ resized_for_log = [img.resize((1024, 1024), Image.BICUBIC) for img in results]
217
+ for tracker in accelerator.trackers:
218
+ phase_name = "test" if is_final_validation else "validation"
219
+ if tracker.name == "tensorboard":
220
+ continue
221
+ if tracker.name == "wandb":
222
+ tracker.log({
223
+ phase_name: [wandb.Image(image, caption=f"{i}: {prompts[i] if i < len(prompts) else ''}") for i, image in enumerate(resized_for_log)]
224
+ })
225
+
226
+ del pipeline
227
+ if torch.cuda.is_available():
228
+ torch.cuda.empty_cache()
229
+
230
+ return results
231
+
232
+
233
+ def save_with_retry(img: Image.Image, path: str, max_retries: int = 3) -> bool:
234
+ """Save PIL image with simple retry and exponential backoff to mitigate transient I/O errors."""
235
+ last_err = None
236
+ for attempt in range(max_retries):
237
+ try:
238
+ os.makedirs(os.path.dirname(path), exist_ok=True)
239
+ img.save(path)
240
+ return True
241
+ except OSError as e:
242
+ last_err = e
243
+ # Exponential backoff: 1.0, 1.5, 2.25 seconds ...
244
+ time.sleep(1.5 ** attempt)
245
+ logger.warning(f"Failed to save {path} after {max_retries} retries: {last_err}")
246
+ return False
247
+
248
+
249
+ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"):
250
+ text_encoder_config = transformers.PretrainedConfig.from_pretrained(
251
+ pretrained_model_name_or_path, subfolder=subfolder, revision=revision
252
+ )
253
+ model_class = text_encoder_config.architectures[0]
254
+ if model_class == "CLIPTextModel":
255
+ from transformers import CLIPTextModel
256
+
257
+ return CLIPTextModel
258
+ elif model_class == "T5EncoderModel":
259
+ from transformers import T5EncoderModel
260
+
261
+ return T5EncoderModel
262
+ else:
263
+ raise ValueError(f"{model_class} is not supported.")
264
+
265
+
266
+ def parse_args(input_args=None):
267
+ parser = argparse.ArgumentParser(description="Training script for Flux Kontext with EasyControl.")
268
+ parser.add_argument("--mode", type=str, default=None, help="Controller mode; kept for compatibility.")
269
+
270
+ # Dataset arguments
271
+ parser.add_argument("--dataset_mode", type=str, default="mixed", choices=["interactive", "placement", "pexels", "mixed"],
272
+ help="Dataset mode: interactive, placement, pexels, or mixed")
273
+ parser.add_argument("--train_data_jsonl", type=str, default="/robby/share/Editing/lzc/HOI_v1/final_metadata.jsonl",
274
+ help="Path to interactive dataset JSONL")
275
+ parser.add_argument("--placement_data_jsonl", type=str, default="/robby/share/Editing/lzc/subject_placement/metadata_relight.jsonl",
276
+ help="Path to placement dataset JSONL")
277
+ parser.add_argument("--pexels_data_jsonl", type=str, default=None,
278
+ help="Path to pexels dataset JSONL")
279
+ parser.add_argument("--interactive_base_dir", type=str, default="/robby/share/Editing/lzc/HOI_v1",
280
+ help="Base directory for interactive dataset")
281
+ parser.add_argument("--placement_base_dir", type=str, default="/robby/share/Editing/lzc/subject_placement",
282
+ help="Base directory for placement dataset")
283
+ parser.add_argument("--pexels_base_dir", type=str, default=None,
284
+ help="Base directory for pexels dataset")
285
+ parser.add_argument("--pexels_relight_base_dir", type=str, default=None,
286
+ help="Base directory for pexels relighted images")
287
+ parser.add_argument("--seg_base_dir", type=str, default=None,
288
+ help="Directory containing segmentation maps for pexels dataset")
289
+ parser.add_argument("--interactive_weight", type=float, default=1.0,
290
+ help="Sampling weight for interactive dataset (default: 1.0)")
291
+ parser.add_argument("--placement_weight", type=float, default=1.0,
292
+ help="Sampling weight for placement dataset (default: 1.0)")
293
+ parser.add_argument("--pexels_weight", type=float, default=0.1,
294
+ help="Sampling weight for pexels dataset (default: 1.0)")
295
+ parser.add_argument("--pretrained_model_name_or_path", type=str, default="", required=False, help="Base model path")
296
+ parser.add_argument("--pretrained_lora_path", type=str, default=None, required=False, help="LoRA checkpoint to initialize from")
297
+ parser.add_argument("--revision", type=str, default=None, required=False, help="Revision of pretrained model")
298
+ parser.add_argument("--variant", type=str, default=None, help="Variant of the model files")
299
+
300
+ parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
301
+ parser.add_argument("--max_sequence_length", type=int, default=128, help="Max sequence length for T5")
302
+ parser.add_argument("--kontext", type=str, default="enable")
303
+ parser.add_argument("--validation_prompt", type=str, nargs="+", default=None)
304
+ parser.add_argument("--validation_images", type=str, nargs="+", default=None, help="List of valiadation images")
305
+ parser.add_argument("--num_validation_images", type=int, default=4)
306
+ parser.add_argument("--validation_steps", type=int, default=20)
307
+
308
+ parser.add_argument("--ranks", type=int, nargs="+", default=[32], help="LoRA ranks")
309
+ parser.add_argument("--output_dir", type=str, default="", help="Output directory")
310
+ parser.add_argument("--seed", type=int, default=None)
311
+ parser.add_argument("--train_batch_size", type=int, default=1)
312
+ parser.add_argument("--num_train_epochs", type=int, default=50)
313
+ parser.add_argument("--max_train_steps", type=int, default=None)
314
+ parser.add_argument("--checkpointing_steps", type=int, default=1000)
315
+ parser.add_argument("--checkpoints_total_limit", type=int, default=None)
316
+ parser.add_argument("--resume_from_checkpoint", type=str, default=None)
317
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
318
+ parser.add_argument("--gradient_checkpointing", action="store_true")
319
+ parser.add_argument("--learning_rate", type=float, default=1e-4)
320
+ parser.add_argument("--guidance_scale", type=float, default=1.0, help="Flux Kontext is guidance distilled")
321
+ parser.add_argument("--scale_lr", action="store_true", default=False)
322
+ parser.add_argument("--lr_scheduler", type=str, default="constant")
323
+ parser.add_argument("--lr_warmup_steps", type=int, default=500)
324
+ parser.add_argument("--lr_num_cycles", type=int, default=1)
325
+ parser.add_argument("--lr_power", type=float, default=1.0)
326
+ parser.add_argument("--dataloader_num_workers", type=int, default=8)
327
+ parser.add_argument("--weighting_scheme", type=str, default="none", choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"])
328
+ parser.add_argument("--logit_mean", type=float, default=0.0)
329
+ parser.add_argument("--logit_std", type=float, default=1.0)
330
+ parser.add_argument("--mode_scale", type=float, default=1.29)
331
+ parser.add_argument("--optimizer", type=str, default="AdamW")
332
+ parser.add_argument("--use_8bit_adam", action="store_true")
333
+ parser.add_argument("--adam_beta1", type=float, default=0.9)
334
+ parser.add_argument("--adam_beta2", type=float, default=0.999)
335
+ parser.add_argument("--prodigy_beta3", type=float, default=None)
336
+ parser.add_argument("--prodigy_decouple", type=bool, default=True)
337
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-04)
338
+ parser.add_argument("--adam_weight_decay_text_encoder", type=float, default=1e-03)
339
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08)
340
+ parser.add_argument("--prodigy_use_bias_correction", type=bool, default=True)
341
+ parser.add_argument("--prodigy_safeguard_warmup", type=bool, default=True)
342
+ parser.add_argument("--max_grad_norm", type=float, default=1.0)
343
+ parser.add_argument("--logging_dir", type=str, default="logs")
344
+ parser.add_argument("--cache_latents", action="store_true", default=False)
345
+ parser.add_argument("--report_to", type=str, default="tensorboard")
346
+ parser.add_argument("--mixed_precision", type=str, default="bf16", choices=["no", "fp16", "bf16"])
347
+ parser.add_argument("--upcast_before_saving", action="store_true", default=False)
348
+
349
+ # Blending options for dataset pixel_values
350
+ parser.add_argument("--blend_pixel_values", action="store_true", help="Blend target/source into pixel_values using mask")
351
+ parser.add_argument("--blend_kernel", type=int, default=21, help="Gaussian blur kernel size (must be odd)")
352
+ parser.add_argument("--blend_sigma", type=float, default=10.0, help="Gaussian blur sigma")
353
+
354
+ if input_args is not None:
355
+ args = parser.parse_args(input_args)
356
+ else:
357
+ args = parser.parse_args()
358
+ return args
359
+
360
+
361
+ def main(args):
362
+ if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
363
+ raise ValueError("Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 or fp32 instead.")
364
+
365
+ if args.output_dir is not None:
366
+ os.makedirs(args.output_dir, exist_ok=True)
367
+ os.makedirs(args.logging_dir, exist_ok=True)
368
+ logging_dir = Path(args.output_dir, args.logging_dir)
369
+
370
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
371
+ kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
372
+ accelerator = Accelerator(
373
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
374
+ mixed_precision=args.mixed_precision,
375
+ log_with=args.report_to,
376
+ project_config=accelerator_project_config,
377
+ kwargs_handlers=[kwargs],
378
+ )
379
+
380
+ if torch.backends.mps.is_available():
381
+ accelerator.native_amp = False
382
+
383
+ if args.report_to == "wandb":
384
+ if not is_wandb_available():
385
+ raise ImportError("Install wandb for logging during training.")
386
+
387
+ logging.basicConfig(
388
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
389
+ datefmt="%m/%d/%Y %H:%M:%S",
390
+ level=logging.INFO,
391
+ )
392
+ logger.info(accelerator.state, main_process_only=False)
393
+ if accelerator.is_local_main_process:
394
+ transformers.utils.logging.set_verbosity_warning()
395
+ diffusers.utils.logging.set_verbosity_info()
396
+ else:
397
+ transformers.utils.logging.set_verbosity_error()
398
+ diffusers.utils.logging.set_verbosity_error()
399
+
400
+ if args.seed is not None:
401
+ set_seed(args.seed)
402
+
403
+ if accelerator.is_main_process and args.output_dir is not None:
404
+ os.makedirs(args.output_dir, exist_ok=True)
405
+
406
+ # Tokenizers
407
+ tokenizer_one = transformers.CLIPTokenizer.from_pretrained(
408
+ args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
409
+ )
410
+ tokenizer_two = transformers.T5TokenizerFast.from_pretrained(
411
+ args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision
412
+ )
413
+
414
+ # Text encoders
415
+ text_encoder_cls_one = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder")
416
+ text_encoder_cls_two = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2")
417
+
418
+ # Scheduler and models
419
+ noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
420
+ noise_scheduler_copy = copy.deepcopy(noise_scheduler)
421
+ text_encoder_one, text_encoder_two = load_text_encoders(args, text_encoder_cls_one, text_encoder_cls_two)
422
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant)
423
+ transformer = FluxTransformer2DModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant)
424
+
425
+ # Train only LoRA adapters: freeze base transformer/text encoders/vae
426
+ transformer.requires_grad_(False)
427
+ vae.requires_grad_(False)
428
+ text_encoder_one.requires_grad_(False)
429
+ text_encoder_two.requires_grad_(False)
430
+
431
+ weight_dtype = torch.float32
432
+ if accelerator.mixed_precision == "fp16":
433
+ weight_dtype = torch.float16
434
+ elif accelerator.mixed_precision == "bf16":
435
+ weight_dtype = torch.bfloat16
436
+
437
+ if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
438
+ raise ValueError("Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 or fp32 instead.")
439
+
440
+ vae.to(accelerator.device, dtype=weight_dtype)
441
+ transformer.to(accelerator.device, dtype=weight_dtype)
442
+ text_encoder_one.to(accelerator.device, dtype=weight_dtype)
443
+ text_encoder_two.to(accelerator.device, dtype=weight_dtype)
444
+
445
+ if args.gradient_checkpointing:
446
+ transformer.enable_gradient_checkpointing()
447
+
448
+ # Setup standard PEFT LoRA on FluxTransformer2DModel
449
+ # target_modules = [
450
+ # "attn.to_k",
451
+ # "attn.to_q",
452
+ # "attn.to_v",
453
+ # "attn.to_out.0",
454
+ # "attn.add_k_proj",
455
+ # "attn.add_q_proj",
456
+ # "attn.add_v_proj",
457
+ # "attn.to_add_out",
458
+ # "ff.net.0.proj",
459
+ # "ff.net.2",
460
+ # "ff_context.net.0.proj",
461
+ # "ff_context.net.2",
462
+ # ]
463
+ target_modules = [
464
+ "attn.to_k",
465
+ "attn.to_q",
466
+ "attn.to_v",
467
+ "attn.to_out.0",
468
+ "attn.add_k_proj",
469
+ "attn.add_q_proj",
470
+ "attn.add_v_proj",
471
+ "attn.to_add_out",
472
+ "ff.net.0.proj",
473
+ "ff.net.2",
474
+ "ff_context.net.0.proj",
475
+ "ff_context.net.2",
476
+ # ===========================================================
477
+ # [Addition 1]: layers specific to the single-stream blocks (single_transformer_blocks)
478
+ # ===========================================================
479
+ # Note: the attention layers of the single-stream blocks (to_q, to_k, to_v) are already covered by the generic names above.
480
+ # What is added here are their dedicated MLP and output layers.
481
+ "proj_mlp",
482
+ "proj_out", # 这个名称也会匹配单流块各自的输出层和模型总输出层
483
+
484
+ # ===========================================================
485
+ # [Addition 2]: all normalization (Norm) layers
486
+ # ===========================================================
487
+ # Note: these layers adjust the feature distribution and matter a lot for style learning.
488
+ # 使用 "linear" 可以一次性匹配所有以 ".linear" 结尾的Norm层。
489
+ "linear", # 匹配 norm1.linear, norm1_context.linear, norm.linear, norm_out.linear
490
+ ]
491
+ lora_rank = int(args.ranks[0]) if isinstance(args.ranks, list) and len(args.ranks) > 0 else 256
492
+ lora_config = LoraConfig(
493
+ r=lora_rank,
494
+ lora_alpha=lora_rank,
495
+ init_lora_weights="gaussian",
496
+ target_modules=target_modules,
497
+ )
498
+ transformer.add_adapter(lora_config)
499
+ transformer.train()
500
+ print(sum([p.numel() for p in transformer.parameters() if p.requires_grad]) / 1000000, 'M parameters')
501
+
502
+ def unwrap_model(model):
503
+ model = accelerator.unwrap_model(model)
504
+ model = model._orig_mod if is_compiled_module(model) else model
505
+ return model
506
+
507
+ if args.resume_from_checkpoint:
508
+ path = args.resume_from_checkpoint
509
+ global_step = int(path.split("-")[-1])
510
+ initial_global_step = global_step
511
+ else:
512
+ initial_global_step = 0
513
+ global_step = 0
514
+ first_epoch = 0
515
+
516
+ if args.scale_lr:
517
+ args.learning_rate = (
518
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
519
+ )
520
+
521
+ if args.mixed_precision == "fp16":
522
+ models = [transformer]
523
+ cast_training_params(models, dtype=torch.float32)
524
+
525
+ params_to_optimize = [p for p in transformer.parameters() if p.requires_grad]
526
+ transformer_parameters_with_lr = {"params": params_to_optimize, "lr": args.learning_rate}
527
+ # print(sum([p.numel() for p in transformer.parameters() if p.requires_grad]) / 1000000, 'parameters')
528
+
529
+ optimizer_class = torch.optim.AdamW
530
+ optimizer = optimizer_class(
531
+ [transformer_parameters_with_lr],
532
+ betas=(args.adam_beta1, args.adam_beta2),
533
+ weight_decay=args.adam_weight_decay,
534
+ eps=args.adam_epsilon,
535
+ )
536
+
537
+ tokenizers = [tokenizer_one, tokenizer_two]
538
+ text_encoders = [text_encoder_one, text_encoder_two]
539
+
540
+ # Create dataset based on mode
541
+ if args.dataset_mode == "mixed":
542
+ # Mixed mode: combine all available datasets
543
+ train_dataset = make_mixed_dataset(
544
+ args,
545
+ tokenizers,
546
+ interactive_jsonl_path=args.train_data_jsonl,
547
+ placement_jsonl_path=args.placement_data_jsonl,
548
+ pexels_jsonl_path=args.pexels_data_jsonl,
549
+ interactive_base_dir=args.interactive_base_dir,
550
+ placement_base_dir=args.placement_base_dir,
551
+ pexels_base_dir=args.pexels_base_dir,
552
+ interactive_weight=args.interactive_weight,
553
+ placement_weight=args.placement_weight,
554
+ pexels_weight=args.pexels_weight,
555
+ accelerator=accelerator
556
+ )
557
+ weights_str = []
558
+ if args.train_data_jsonl:
559
+ weights_str.append(f"Interactive: {args.interactive_weight:.2f}")
560
+ if args.placement_data_jsonl:
561
+ weights_str.append(f"Placement: {args.placement_weight:.2f}")
562
+ if args.pexels_data_jsonl:
563
+ weights_str.append(f"Pexels: {args.pexels_weight:.2f}")
564
+ logger.info(f"Mixed dataset created with weights - {', '.join(weights_str)}")
565
+ elif args.dataset_mode == "pexels":
566
+ if not args.pexels_data_jsonl:
567
+ raise ValueError("pexels_data_jsonl must be provided for pexels mode")
568
+ train_dataset = make_pexels_dataset_subjects(args, tokenizers, accelerator)
569
+ elif args.dataset_mode == "placement":
570
+ if not args.placement_data_jsonl:
571
+ raise ValueError("placement_data_jsonl must be provided for placement mode")
572
+ train_dataset = make_placement_dataset_subjects(args, tokenizers, accelerator)
573
+ else: # interactive mode
574
+ train_dataset = make_interactive_dataset_subjects(args, tokenizers, accelerator)
575
+
576
+ train_dataloader = torch.utils.data.DataLoader(
577
+ train_dataset,
578
+ batch_size=args.train_batch_size,
579
+ shuffle=True,
580
+ collate_fn=collate_fn,
581
+ num_workers=args.dataloader_num_workers,
582
+ )
583
+
584
+ vae_config_shift_factor = vae.config.shift_factor
585
+ vae_config_scaling_factor = vae.config.scaling_factor
586
+
587
+ overrode_max_train_steps = False
588
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
589
+ if args.resume_from_checkpoint:
590
+ first_epoch = global_step // num_update_steps_per_epoch
591
+ if args.max_train_steps is None:
592
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
593
+ overrode_max_train_steps = True
594
+
595
+ lr_scheduler = get_scheduler(
596
+ args.lr_scheduler,
597
+ optimizer=optimizer,
598
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
599
+ num_training_steps=args.max_train_steps * accelerator.num_processes,
600
+ num_cycles=args.lr_num_cycles,
601
+ power=args.lr_power,
602
+ )
603
+
604
+ transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
605
+ transformer, optimizer, train_dataloader, lr_scheduler
606
+ )
607
+
608
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
609
+ if overrode_max_train_steps:
610
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
611
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
612
+
613
+ # Sanitize config for TensorBoard hparams (only allow int/float/bool/str/tensor). Others are stringified if possible; otherwise dropped
614
+ def _sanitize_hparams(config_dict):
615
+ sanitized = {}
616
+ for key, value in dict(config_dict).items():
617
+ try:
618
+ if value is None:
619
+ continue
620
+ # numpy scalar types
621
+ if isinstance(value, (np.integer,)):
622
+ sanitized[key] = int(value)
623
+ elif isinstance(value, (np.floating,)):
624
+ sanitized[key] = float(value)
625
+ elif isinstance(value, (int, float, bool, str)):
626
+ sanitized[key] = value
627
+ elif isinstance(value, Path):
628
+ sanitized[key] = str(value)
629
+ elif isinstance(value, (list, tuple)):
630
+ # stringify simple sequences; skip if fails
631
+ sanitized[key] = str(value)
632
+ else:
633
+ # best-effort stringify
634
+ sanitized[key] = str(value)
635
+ except Exception:
636
+ # skip unconvertible entries
637
+ continue
638
+ return sanitized
639
+
640
+ if accelerator.is_main_process:
641
+ tracker_name = "Easy_Control_Kontext"
642
+ accelerator.init_trackers(tracker_name, config=_sanitize_hparams(vars(args)))
643
+
644
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
645
+ logger.info("***** Running training *****")
646
+ logger.info(f" Num examples = {len(train_dataset)}")
647
+ logger.info(f" Num batches each epoch = {len(train_dataloader)}")
648
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
649
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
650
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
651
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
652
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
653
+
654
+ progress_bar = tqdm(
655
+ range(0, args.max_train_steps),
656
+ initial=initial_global_step,
657
+ desc="Steps",
658
+ disable=not accelerator.is_local_main_process,
659
+ )
660
+
661
+ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
662
+ sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
663
+ schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
664
+ timesteps = timesteps.to(accelerator.device)
665
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
666
+ sigma = sigmas[step_indices].flatten()
667
+ while len(sigma.shape) < n_dim:
668
+ sigma = sigma.unsqueeze(-1)
669
+ return sigma
670
+
671
+ # Kontext specifics
672
+ vae_scale_factor = 8 # Kontext uses 8x VAE factor; pack/unpack uses additional 2x in methods
673
+
674
+ for epoch in range(first_epoch, args.num_train_epochs):
675
+ transformer.train()
676
+ for step, batch in enumerate(train_dataloader):
677
+ models_to_accumulate = [transformer]
678
+ with accelerator.accumulate(models_to_accumulate):
679
+ tokens = [batch["text_ids_1"], batch["text_ids_2"]]
680
+ prompt_embeds, pooled_prompt_embeds, text_ids = encode_token_ids(text_encoders, tokens, accelerator)
681
+ prompt_embeds = prompt_embeds.to(dtype=vae.dtype, device=accelerator.device)
682
+ pooled_prompt_embeds = pooled_prompt_embeds.to(dtype=vae.dtype, device=accelerator.device)
683
+ text_ids = text_ids.to(dtype=vae.dtype, device=accelerator.device)
684
+
685
+ pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
686
+ height_ = 2 * (int(pixel_values.shape[-2]) // (vae_scale_factor * 2))
687
+ width_ = 2 * (int(pixel_values.shape[-1]) // (vae_scale_factor * 2))
688
+
689
+ model_input = vae.encode(pixel_values).latent_dist.sample()
690
+ model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
691
+ model_input = model_input.to(dtype=weight_dtype)
692
+
693
+ # Prepare latent ids for transformer (positional encodings)
694
+ latent_image_ids = FluxKontextPipeline._prepare_latent_image_ids(
695
+ batch_size=model_input.shape[0],
696
+ height=model_input.shape[2] // 2,
697
+ width=model_input.shape[3] // 2,
698
+ device=accelerator.device,
699
+ dtype=weight_dtype,
700
+ )
701
+
702
+ noise = torch.randn_like(model_input)
703
+ bsz = model_input.shape[0]
704
+
705
+ u = compute_density_for_timestep_sampling(
706
+ weighting_scheme=args.weighting_scheme,
707
+ batch_size=bsz,
708
+ logit_mean=args.logit_mean,
709
+ logit_std=args.logit_std,
710
+ mode_scale=args.mode_scale,
711
+ )
712
+ indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
713
+ timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
714
+
715
+ sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
716
+ noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
717
+
718
+ packed_noisy_model_input = FluxKontextPipeline._pack_latents(
719
+ noisy_model_input,
720
+ batch_size=model_input.shape[0],
721
+ num_channels_latents=model_input.shape[1],
722
+ height=model_input.shape[2],
723
+ width=model_input.shape[3],
724
+ )
725
+
726
+ if accelerator.unwrap_model(transformer).config.guidance_embeds:
727
+ guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
728
+ guidance = guidance.expand(model_input.shape[0])
729
+ else:
730
+ guidance = None
731
+
732
+ # If kontext editing is enabled, append source image latents to the sequence
733
+ latent_model_input = packed_noisy_model_input
734
+ if args.kontext == "enable":
735
+ source_pixel_values = batch["source_pixel_values"].to(dtype=vae.dtype)
736
+ source_image_latents = vae.encode(source_pixel_values).latent_dist.sample()
737
+ source_image_latents = (source_image_latents - vae_config_shift_factor) * vae_config_scaling_factor
738
+ image_latent_h, image_latent_w = source_image_latents.shape[2:]
739
+ packed_image_latents = FluxKontextPipeline._pack_latents(
740
+ source_image_latents,
741
+ batch_size=source_image_latents.shape[0],
742
+ num_channels_latents=source_image_latents.shape[1],
743
+ height=image_latent_h,
744
+ width=image_latent_w,
745
+ )
746
+ source_image_ids = FluxKontextPipeline._prepare_latent_image_ids(
747
+ batch_size=source_image_latents.shape[0],
748
+ height=image_latent_h // 2,
749
+ width=image_latent_w // 2,
750
+ device=accelerator.device,
751
+ dtype=weight_dtype,
752
+ )
753
+ source_image_ids[..., 0] = 1
754
+ latent_model_input = torch.cat([latent_model_input, packed_image_latents], dim=1)
755
+ latent_image_ids = torch.cat([latent_image_ids, source_image_ids], dim=0)
756
+
757
+ # Forward transformer with packed latents and ids
758
+ model_pred = transformer(
759
+ hidden_states=latent_model_input,
760
+ timestep=timesteps / 1000,
761
+ guidance=guidance,
762
+ pooled_projections=pooled_prompt_embeds,
763
+ encoder_hidden_states=prompt_embeds,
764
+ txt_ids=text_ids,
765
+ img_ids=latent_image_ids,
766
+ return_dict=False,
767
+ )[0]
768
+
769
+ model_pred = model_pred[:, : packed_noisy_model_input.size(1)]
770
+
771
+ model_pred = FluxKontextPipeline._unpack_latents(
772
+ model_pred,
773
+ height=int(pixel_values.shape[-2]),
774
+ width=int(pixel_values.shape[-1]),
775
+ vae_scale_factor=vae_scale_factor,
776
+ )
777
+
778
+ weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
779
+ target = noise - model_input
780
+
781
+ loss = torch.mean((weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1), 1)
782
+ loss = loss.mean()
783
+ accelerator.backward(loss)
784
+ if accelerator.sync_gradients:
785
+ params_to_clip = (transformer.parameters())
786
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
787
+
788
+ optimizer.step()
789
+ lr_scheduler.step()
790
+ optimizer.zero_grad()
791
+
792
+ if accelerator.sync_gradients:
793
+ progress_bar.update(1)
794
+ global_step += 1
795
+
796
+ if accelerator.is_main_process:
797
+ if global_step % args.checkpointing_steps == 0:
798
+ if args.checkpoints_total_limit is not None:
799
+ checkpoints = os.listdir(args.output_dir)
800
+ checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
801
+ checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
802
+ if len(checkpoints) >= args.checkpoints_total_limit:
803
+ num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
804
+ removing_checkpoints = checkpoints[0:num_to_remove]
805
+ logger.info(f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints")
806
+ logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
807
+ for removing_checkpoint in removing_checkpoints:
808
+ removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
809
+ shutil.rmtree(removing_checkpoint)
810
+
811
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
812
+ os.makedirs(save_path, exist_ok=True)
813
+ unwrapped = accelerator.unwrap_model(transformer)
814
+ peft_state = get_peft_model_state_dict(unwrapped)
815
+ # Convert PEFT state dict to diffusers LoRA format for transformer
816
+ diffusers_lora = convert_state_dict_to_diffusers(peft_state)
817
+ save_file(diffusers_lora, os.path.join(save_path, "pytorch_lora_weights.safetensors"))
818
+ logger.info(f"Saved state to {save_path}")
819
+
820
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
821
+ progress_bar.set_postfix(**logs)
822
+ accelerator.log(logs, step=global_step)
823
+
824
+ if args.validation_prompt is not None and global_step % args.validation_steps == 0:
825
+ # Create pipeline on every rank to run validation in parallel
826
+ pipeline = FluxKontextPipeline.from_pretrained(
827
+ args.pretrained_model_name_or_path,
828
+ vae=vae,
829
+ text_encoder=accelerator.unwrap_model(text_encoder_one),
830
+ text_encoder_2=accelerator.unwrap_model(text_encoder_two),
831
+ transformer=accelerator.unwrap_model(transformer),
832
+ revision=args.revision,
833
+ variant=args.variant,
834
+ torch_dtype=weight_dtype,
835
+ )
836
+
837
+ pipeline_args = {
838
+ "prompt": args.validation_prompt,
839
+ "guidance_scale": 3.5,
840
+ "num_inference_steps": 20,
841
+ "max_sequence_length": 128,
842
+ }
843
+
844
+ images = log_validation(
845
+ pipeline=pipeline,
846
+ args=args,
847
+ accelerator=accelerator,
848
+ pipeline_args=pipeline_args,
849
+ step=global_step,
850
+ torch_dtype=weight_dtype,
851
+ )
852
+
853
+ # Only main process saves/logs
854
+ if accelerator.is_main_process:
855
+ save_path = os.path.join(args.output_dir, "validation")
856
+ os.makedirs(save_path, exist_ok=True)
857
+ save_folder = os.path.join(save_path, f"checkpoint-{global_step}")
858
+ os.makedirs(save_folder, exist_ok=True)
859
+ for idx, img in enumerate(images):
860
+ out_path = os.path.join(save_folder, f"{idx}.jpg")
861
+ save_with_retry(img, out_path)
862
+ del pipeline
863
+
864
+ accelerator.wait_for_everyone()
865
+ accelerator.end_training()
866
+
867
+
868
+ if __name__ == "__main__":
869
+ args = parse_args()
870
+ main(args)
871
+
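
The checkpoints written above use the diffusers LoRA layout (pytorch_lora_weights.safetensors produced via convert_state_dict_to_diffusers), so they should be loadable for inference through the standard diffusers LoRA loader. A hedged sketch, assuming FluxKontextPipeline exposes load_lora_weights and using placeholder paths:

import torch
from PIL import Image
from diffusers import FluxKontextPipeline

pipe = FluxKontextPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Kontext-dev",   # or the local base-model path used for training
    torch_dtype=torch.bfloat16,
).to("cuda")

# Directory containing pytorch_lora_weights.safetensors saved by the training loop above.
pipe.load_lora_weights("output/checkpoint-1000", adapter_name="kontext_lora")

source = Image.open("input.png").convert("RGB")  # placeholder input image
result = pipe(
    image=source,
    prompt="convert the dinosaur into blue color",
    guidance_scale=3.5,
    num_inference_steps=20,
).images[0]
result.save("edited.png")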
util.py ADDED
@@ -0,0 +1,188 @@
1
+ import random
2
+ from collections import Counter
3
+ import numpy as np
4
+ from torchvision import transforms
5
+ import cv2 # OpenCV
6
+ import torch
7
+ import re
8
+ import io
9
+ import base64
10
+ from PIL import Image, ImageOps
11
+ from src.pipeline_flux_kontext_control import PREFERRED_KONTEXT_RESOLUTIONS
12
+
13
+ def get_bounding_box_from_mask(mask, padded=False):
14
+ mask = mask.squeeze()
15
+ rows, cols = torch.where(mask > 0.5)
16
+ if len(rows) == 0 or len(cols) == 0:
17
+ return (0, 0, 0, 0)
18
+ height, width = mask.shape
19
+ if padded:
20
+ padded_size = max(width, height)
21
+ if width < height:
22
+ offset_x = (padded_size - width) / 2
23
+ offset_y = 0
24
+ else:
25
+ offset_y = (padded_size - height) / 2
26
+ offset_x = 0
27
+ top_left_x = round(float((torch.min(cols).item() + offset_x) / padded_size), 3)
28
+ bottom_right_x = round(float((torch.max(cols).item() + offset_x) / padded_size), 3)
29
+ top_left_y = round(float((torch.min(rows).item() + offset_y) / padded_size), 3)
30
+ bottom_right_y = round(float((torch.max(rows).item() + offset_y) / padded_size), 3)
31
+ else:
32
+ offset_x = 0
33
+ offset_y = 0
34
+
35
+ top_left_x = round(float(torch.min(cols).item() / width), 3)
36
+ bottom_right_x = round(float(torch.max(cols).item() / width), 3)
37
+ top_left_y = round(float(torch.min(rows).item() / height), 3)
38
+ bottom_right_y = round(float(torch.max(rows).item() / height), 3)
39
+
40
+
41
+ return (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
42
+
43
+ def extract_bbox(text):
44
+ pattern = r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]"
45
+ match = re.search(pattern, text)
46
+ return (int(match.group(1)), int(match.group(2)), int(match.group(3)), int(match.group(4)))
47
+
48
+ def resize_bbox(bbox, width_ratio, height_ratio):
49
+ x1, y1, x2, y2 = bbox
50
+ new_x1 = int(x1 * width_ratio)
51
+ new_y1 = int(y1 * height_ratio)
52
+ new_x2 = int(x2 * width_ratio)
53
+ new_y2 = int(y2 * height_ratio)
54
+
55
+ return (new_x1, new_y1, new_x2, new_y2)
56
+
57
+
58
+ def tensor_to_base64(tensor, quality=80, method=6):
59
+ tensor = tensor.squeeze(0).clone().detach().cpu()
60
+
61
+ if tensor.dtype == torch.float32 or tensor.dtype == torch.float64 or tensor.dtype == torch.float16:
62
+ tensor *= 255
63
+ tensor = tensor.to(torch.uint8)
64
+
65
+ if tensor.ndim == 2: # grayscale image
66
+ pil_image = Image.fromarray(tensor.numpy(), 'L')
67
+ pil_image = pil_image.convert('RGB')
68
+ elif tensor.ndim == 3:
69
+ if tensor.shape[2] == 1: # single channel
70
+ pil_image = Image.fromarray(tensor.numpy().squeeze(2), 'L')
71
+ pil_image = pil_image.convert('RGB')
72
+ elif tensor.shape[2] == 3: # RGB
73
+ pil_image = Image.fromarray(tensor.numpy(), 'RGB')
74
+ elif tensor.shape[2] == 4: # RGBA
75
+ pil_image = Image.fromarray(tensor.numpy(), 'RGBA')
76
+ else:
77
+ raise ValueError(f"Unsupported number of channels: {tensor.shape[2]}")
78
+ else:
79
+ raise ValueError(f"Unsupported tensor dimensions: {tensor.ndim}")
80
+
81
+ buffered = io.BytesIO()
82
+ pil_image.save(buffered, format="WEBP", quality=quality, method=method, lossless=False)
83
+ img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
84
+ return img_str
85
+
86
+ def load_and_preprocess_image(image_path, convert_to='RGB', has_alpha=False):
87
+ image = Image.open(image_path)
88
+ image = ImageOps.exif_transpose(image)
89
+
90
+ if image.mode == 'RGBA':
91
+ background = Image.new('RGBA', image.size, (255, 255, 255, 255))
92
+ image = Image.alpha_composite(background, image)
93
+ image = image.convert(convert_to)
94
+ image_array = np.array(image).astype(np.float32) / 255.0
95
+
96
+ if has_alpha and convert_to == 'RGBA':
97
+ image_tensor = torch.from_numpy(image_array)[None,]
98
+ else:
99
+ if len(image_array.shape) == 3 and image_array.shape[2] > 3:
100
+ image_array = image_array[:, :, :3]
101
+ image_tensor = torch.from_numpy(image_array)[None,]
102
+
103
+ return image_tensor
104
+
105
+ def process_background(base64_image, convert_to='RGB', size=None):
106
+ image_data = read_base64_image(base64_image)
107
+ image = Image.open(image_data)
108
+ image = ImageOps.exif_transpose(image)
109
+ image = image.convert(convert_to)
110
+
111
+ # Select preferred size by closest aspect ratio, then snap to multiple_of
112
+ w0, h0 = image.size
113
+ aspect_ratio = (w0 / h0) if h0 != 0 else 1.0
114
+ # Choose the (w, h) whose aspect ratio is closest to the input
115
+ _, tw, th = min((abs(aspect_ratio - w / h), w, h) for (w, h) in PREFERRED_KONTEXT_RESOLUTIONS)
116
+ multiple_of = 16 # default: vae_scale_factor (8) * 2
117
+ tw = (tw // multiple_of) * multiple_of
118
+ th = (th // multiple_of) * multiple_of
119
+
120
+ if (w0, h0) != (tw, th):
121
+ image = image.resize((tw, th), resample=Image.BICUBIC)
122
+
123
+ image_array = np.array(image).astype(np.uint8)
124
+ image_tensor = torch.from_numpy(image_array)[None,]
125
+ return image_tensor
126
+
127
+ def read_base64_image(base64_image):
128
+ if base64_image.startswith("data:image/png;base64,"):
129
+ base64_image = base64_image.split(",")[1]
130
+ elif base64_image.startswith("data:image/jpeg;base64,"):
131
+ base64_image = base64_image.split(",")[1]
132
+ elif base64_image.startswith("data:image/webp;base64,"):
133
+ base64_image = base64_image.split(",")[1]
134
+ else:
135
+ raise ValueError("Unsupported image format.")
136
+ image_data = base64.b64decode(base64_image)
137
+ return io.BytesIO(image_data)
138
+
139
+ def create_alpha_mask(image_path):
+     """Create an alpha mask from the alpha channel of an image."""
+     image = Image.open(image_path)
+     image = ImageOps.exif_transpose(image)
+     mask = torch.zeros((1, image.height, image.width), dtype=torch.float32)
+     if 'A' in image.getbands():
+         alpha_channel = np.array(image.getchannel('A')).astype(np.float32) / 255.0
+         mask[0] = 1.0 - torch.from_numpy(alpha_channel)
+     return mask
+ 
+ def get_mask_bbox(mask_tensor, padding=10):
+     assert len(mask_tensor.shape) == 3 and mask_tensor.shape[0] == 1
+     _, H, W = mask_tensor.shape
+     mask_2d = mask_tensor.squeeze(0)
+ 
+     y_coords, x_coords = torch.where(mask_2d > 0)
+ 
+     if len(y_coords) == 0:
+         return None
+ 
+     x_min = int(torch.min(x_coords))
+     y_min = int(torch.min(y_coords))
+     x_max = int(torch.max(x_coords))
+     y_max = int(torch.max(y_coords))
+ 
+     x_min = max(0, x_min - padding)
+     y_min = max(0, y_min - padding)
+     x_max = min(W - 1, x_max + padding)
+     y_max = min(H - 1, y_max + padding)
+ 
+     return x_min, y_min, x_max, y_max
+ 
+ def tensor_to_pil(tensor):
+     tensor = tensor.squeeze(0).clone().detach().cpu()
+     if tensor.dtype in [torch.float32, torch.float64, torch.float16]:
+         if tensor.max() <= 1.0:
+             tensor *= 255
+         tensor = tensor.to(torch.uint8)
+ 
+     if tensor.ndim == 2:  # grayscale image [H, W]
+         return Image.fromarray(tensor.numpy(), 'L')
+     elif tensor.ndim == 3:
+         if tensor.shape[2] == 1:  # single channel [H, W, 1]
+             return Image.fromarray(tensor.numpy().squeeze(2), 'L')
+         elif tensor.shape[2] >= 3:  # RGB [H, W, 3]; any extra channels (e.g. alpha) are dropped
+             return Image.fromarray(tensor.numpy()[:, :, :3], 'RGB')
+         else:
+             raise ValueError(f"Unsupported number of channels: {tensor.shape[2]}")
+     else:
+         raise ValueError(f"Unsupported tensor dimensions: {tensor.ndim}")
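For orientation, here is a minimal sketch of how the util.py helpers above can be chained; the file path "input.png", the output path, and the import from util are illustrative assumptions, not part of the commit:

    # Illustrative sketch only; assumes the repo root is on PYTHONPATH and an RGBA test image exists.
    from util import load_and_preprocess_image, create_alpha_mask, get_mask_bbox, tensor_to_pil

    image_tensor = load_and_preprocess_image("input.png")   # [1, H, W, 3] float32 in [0, 1]
    mask = create_alpha_mask("input.png")                    # [1, H, W], 1.0 where the pixel was transparent
    bbox = get_mask_bbox(mask, padding=10)                   # (x_min, y_min, x_max, y_max), or None if the mask is empty
    if bbox is not None:
        x_min, y_min, x_max, y_max = bbox
        crop = image_tensor[:, y_min:y_max + 1, x_min:x_max + 1, :]
        tensor_to_pil(crop).save("crop.png")                 # round-trip the cropped tensor back to a PIL image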
utils_node.py ADDED
@@ -0,0 +1,199 @@
+ import torch
+ import torch.nn.functional as F
+ from PIL import Image
+ import numpy as np
+ from tqdm import trange
+ import torchvision.transforms as T
+ from typing import Tuple
+ import scipy.ndimage
+ import cv2
+ from train.src.condition.util import HWC3, common_input_validate
+ 
+ def check_image_mask(image, mask, name):
+     if len(image.shape) < 4:
+         # image tensor shape should be [B, H, W, C]; the batch dimension is missing, so add it
+         image = image[None, :, :, :]
+ 
+     if len(mask.shape) > 3:
+         # mask tensor shape should be [B, H, W] but we got [B, H, W, C] (probably an image);
+         # keep only the red channel of each mask
+         mask = mask[:, :, :, 0]
+     elif len(mask.shape) < 3:
+         # mask tensor shape should be [B, H, W]; the batch dimension is missing, so add it
+         mask = mask[None, :, :]
+ 
+     if image.shape[0] > mask.shape[0]:
+         print(name, "gets batch of images (%d) but only %d masks" % (image.shape[0], mask.shape[0]))
+         if mask.shape[0] == 1:
+             print(name, "will copy the mask to fill batch")
+             mask = torch.cat([mask] * image.shape[0], dim=0)
+         else:
+             print(name, "will add empty masks to fill batch")
+             empty_mask = torch.zeros([image.shape[0] - mask.shape[0], mask.shape[1], mask.shape[2]])
+             mask = torch.cat([mask, empty_mask], dim=0)
+     elif image.shape[0] < mask.shape[0]:
+         print(name, "gets batch of images (%d) but too many (%d) masks" % (image.shape[0], mask.shape[0]))
+         mask = mask[:image.shape[0], :, :]
+ 
+     return (image, mask)
+ 
+ 
+ def cv2_resize_shortest_edge(image, size):
+     h, w = image.shape[:2]
+     if h < w:
+         new_h = size
+         new_w = int(round(w / h * size))
+     else:
+         new_w = size
+         new_h = int(round(h / w * size))
+     resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)
+     return resized_image
+ 
+ def apply_color(img, res=512):
+     img = cv2_resize_shortest_edge(img, res)
+     h, w = img.shape[:2]
+ 
+     input_img_color = cv2.resize(img, (w//64, h//64), interpolation=cv2.INTER_CUBIC)
+     input_img_color = cv2.resize(input_img_color, (w, h), interpolation=cv2.INTER_NEAREST)
+     return input_img_color
+ 
+ # Color block detector (T2I-Adapter "color" style): downsample to roughly 1/64 resolution, then
+ # upscale back with nearest-neighbor so the output is a grid of flat color blocks; the resize
+ # methods are fixed.
+ class ColorDetector:
+     def __call__(self, input_image=None, detect_resolution=512, output_type=None, **kwargs):
+         input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
+         input_image = HWC3(input_image)
+         detected_map = HWC3(apply_color(input_image, detect_resolution))
+ 
+         if output_type == "pil":
+             detected_map = Image.fromarray(detected_map)
+ 
+         return detected_map
+ 
+ 
+ class InpaintPreprocessor:
+     def preprocess(self, image, mask, black_pixel_for_xinsir_cn=False):
+         mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(image.shape[1], image.shape[2]), mode="bilinear")
+         mask = mask.movedim(1, -1).expand((-1, -1, -1, 3))
+         image = image.clone()
+         if black_pixel_for_xinsir_cn:
+             masked_pixel = 0.0
+         else:
+             masked_pixel = -1.0
+         image[mask > 0.5] = masked_pixel
+         return (image,)
+ 
+ 
+ class BlendInpaint:
+     def blend_inpaint(self, inpaint: torch.Tensor, original: torch.Tensor, mask, kernel: int, sigma: int, origin=None) -> Tuple[torch.Tensor]:
+ 
+         original, mask = check_image_mask(original, mask, 'Blend Inpaint')
+ 
+         if len(inpaint.shape) < 4:
+             # image tensor shape should be [B, H, W, C]; the batch dimension is missing, so add it
+             inpaint = inpaint[None, :, :, :]
+ 
+         if inpaint.shape[0] < original.shape[0]:
+             print("Blend Inpaint gets batch of original images (%d) but only (%d) inpaint images" % (original.shape[0], inpaint.shape[0]))
+             original = original[:inpaint.shape[0], :, :]
+             mask = mask[:inpaint.shape[0], :, :]
+ 
+         if inpaint.shape[0] > original.shape[0]:
+             # batch over inpaint
+             count = 0
+             original_list = []
+             mask_list = []
+             origin_list = []
+             while count < inpaint.shape[0]:
+                 for i in range(original.shape[0]):
+                     original_list.append(original[i][None, :, :, :])
+                     mask_list.append(mask[i][None, :, :])
+                     if origin is not None:
+                         origin_list.append(origin[i][None, :])
+                     count += 1
+                     if count >= inpaint.shape[0]:
+                         break
+             original = torch.concat(original_list, dim=0)
+             mask = torch.concat(mask_list, dim=0)
+             if origin is not None:
+                 origin = torch.concat(origin_list, dim=0)
+ 
+         if kernel % 2 == 0:
+             kernel += 1
+         transform = T.GaussianBlur(kernel_size=(kernel, kernel), sigma=(sigma, sigma))
+ 
+         ret = []
+         blurred = []
+         for i in range(inpaint.shape[0]):
+             if origin is None:
+                 blurred_mask = transform(mask[i][None, None, :, :]).to(original.device).to(original.dtype)
+                 blurred.append(blurred_mask[0])
+ 
+                 result = torch.nn.functional.interpolate(
+                     inpaint[i][None, :, :, :].permute(0, 3, 1, 2),
+                     size=(
+                         original[i].shape[0],
+                         original[i].shape[1],
+                     )
+                 ).permute(0, 2, 3, 1).to(original.device).to(original.dtype)
+             else:
+                 # got mask from CutForInpaint
+                 height, width, _ = original[i].shape
+                 x0 = origin[i][0].item()
+                 y0 = origin[i][1].item()
+ 
+                 if mask[i].shape[0] < height or mask[i].shape[1] < width:
+                     padded_mask = F.pad(input=mask[i], pad=(x0, width - x0 - mask[i].shape[1],
+                                                             y0, height - y0 - mask[i].shape[0]), mode='constant', value=0)
+                 else:
+                     padded_mask = mask[i]
+                 blurred_mask = transform(padded_mask[None, None, :, :]).to(original.device).to(original.dtype)
+                 blurred.append(blurred_mask[0][0])
+ 
+                 result = F.pad(input=inpaint[i], pad=(0, 0, x0, width - x0 - inpaint[i].shape[1],
+                                                       y0, height - y0 - inpaint[i].shape[0]), mode='constant', value=0)
+                 result = result[None, :, :, :].to(original.device).to(original.dtype)
+ 
+             ret.append(original[i] * (1.0 - blurred_mask[0][0][:, :, None]) + result[0] * blurred_mask[0][0][:, :, None])
+ 
+         return (torch.stack(ret), torch.stack(blurred), )
+ 
+ 
+ def resize_mask(mask, shape):
+     return torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(shape[0], shape[1]), mode="bilinear").squeeze(1)
+ 
+ class JoinImageWithAlpha:
+     def join_image_with_alpha(self, image: torch.Tensor, alpha: torch.Tensor):
+         batch_size = min(len(image), len(alpha))
+         out_images = []
+ 
+         alpha = 1.0 - resize_mask(alpha, image.shape[1:])
+         for i in range(batch_size):
+             out_images.append(torch.cat((image[i][:, :, :3], alpha[i].unsqueeze(2)), dim=2))
+ 
+         result = (torch.stack(out_images),)
+         return result
+ 
+ class GrowMask:
+     def expand_mask(self, mask, expand, tapered_corners):
+         c = 0 if tapered_corners else 1
+         kernel = np.array([[c, 1, c],
+                            [1, 1, 1],
+                            [c, 1, c]])
+         mask = mask.reshape((-1, mask.shape[-2], mask.shape[-1]))
+         out = []
+         for m in mask:
+             output = m.numpy()
+             for _ in range(abs(expand)):
+                 if expand < 0:
+                     output = scipy.ndimage.grey_erosion(output, footprint=kernel)
+                 else:
+                     output = scipy.ndimage.grey_dilation(output, footprint=kernel)
+             output = torch.from_numpy(output)
+             out.append(output)
+         return (torch.stack(out, dim=0),)
+ 
+ class InvertMask:
+     def invert(self, mask):
+         out = 1.0 - mask
+         return (out,)
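For context, a minimal sketch of how these node-style helpers (which mirror ComfyUI nodes and therefore return tuples) can be wired together on dummy tensors; the tensor shapes, the kernel/sigma values, and the import from utils_node are illustrative assumptions:

    # Illustrative sketch only; images are [B, H, W, C] floats in [0, 1], masks are [B, H, W].
    import torch
    from utils_node import GrowMask, InvertMask, BlendInpaint

    original = torch.rand(1, 512, 512, 3)      # source image batch
    inpaint = torch.rand(1, 512, 512, 3)       # freshly generated inpaint result
    mask = torch.zeros(1, 512, 512)
    mask[:, 128:384, 128:384] = 1.0            # region that was edited

    (grown,) = GrowMask().expand_mask(mask, expand=8, tapered_corners=True)   # dilate the mask by ~8 px
    (background,) = InvertMask().invert(grown)                                # 1.0 outside the edited region
    blended, blurred = BlendInpaint().blend_inpaint(inpaint, original, grown, kernel=15, sigma=5)
    print(blended.shape, blurred.shape)        # torch.Size([1, 512, 512, 3]) torch.Size([1, 1, 512, 512])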