Spaces:

jiuface
/

flux-dev-multi-lora

Running on Zero

App Files Files Community

jiuface committed on Sep 19

Commit

8d7d2d7

•

1 Parent(s): f93e467

bugfix

Browse files

Files changed (1) hide show

app.py +157 -62

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import gradio as gr
 import numpy as np
 import random
 import torch
 import json
 import logging
@@ -10,12 +11,17 @@ from huggingface_hub import login
 import time
 from datetime import datetime
 from io import BytesIO
-from diffusers.models.attention_processor import AttentionProcessor
 import re
 import json
 # 登录 Hugging Face Hub
 HF_TOKEN = os.environ.get("HF_TOKEN")
 login(token=HF_TOKEN)
 # 初始化
 dtype = torch.float16  # 您可以根据需要调整数据类型
@@ -145,79 +151,160 @@ def create_attention_mask(image_width, image_height, location, offset, area):
     return mask_flat
 # 自定义注意力处理器
-class CustomCrossAttentionProcessor(AttentionProcessor):
-    def __init__(self, masks, embeddings, adapter_names):
         super().__init__()
-        self.masks = masks  # 列表，包含每个角色的掩码
-        self.embeddings = embeddings  # 列表，包含每个角色的嵌入
         self.adapter_names = adapter_names  # 列表，包含每个角色的 LoRA 适配器名称
-    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, **kwargs):
         # 获取当前的 adapter_name
         adapter_name = getattr(attn, 'adapter_name', None)
         if adapter_name is None or adapter_name not in self.adapter_names:
-            # 如果没有 adapter_name，直接执行默认的注意力计算
-            return super().__call__(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
         # 查找 adapter_name 对应的索引
         idx = self.adapter_names.index(adapter_name)
-        mask = self.masks[idx]
-        # 标准的注意力计算
-        batch_size, sequence_length, _ = hidden_states.shape
         query = attn.to_q(hidden_states)
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)
-        # 重塑以适应多头注意力
-        query = query.view(batch_size, -1, attn.heads, attn.head_dim).transpose(1, 2)
-        key = key.view(batch_size, -1, attn.heads, attn.head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, attn.head_dim).transpose(1, 2)
-        # 计算注意力得分
-        attention_scores = torch.matmul(query, key.transpose(-1, -2)) * attn.scale
-        # 应用掩码调整注意力得分
-        # 将 mask 调整为与 attention_scores 兼容的形状
-        # 假设 key_len 与 mask 的长度一致
-        mask_expanded = mask.unsqueeze(0).unsqueeze(0).unsqueeze(0)  # (1, 1, 1, key_len)
-        # 将掩码应用于 attention_scores
-        attention_scores += mask_expanded * 1e6  # 增强对应位置的注意力
-        # 计算注意力概率
-        attention_probs = torch.nn.functional.softmax(attention_scores, dim=-1)
-        # 计算上下文向量
-        context = torch.matmul(attention_probs, value)
-        # 重塑回原始形状
-        context = context.transpose(1, 2).reshape(batch_size, -1, attn.heads * attn.head_dim)
-        # 输出投影
-        hidden_states = attn.to_out(context)
         return hidden_states
 # 替换注意力处理器的函数
-def replace_attention_processors(pipe, masks, embeddings, adapter_names):
-    custom_processor = CustomCrossAttentionProcessor(masks, embeddings, adapter_names)
-    for name, module in pipe.unet.named_modules():
-        if hasattr(module, 'attn2'):
-            # 设置 adapter_name 为模块的属性
-            module.attn2.adapter_name = getattr(module, 'adapter_name', None)
-            module.attn2.processor = custom_processor
 # 生成图像的函数
-@spaces.GPU
-@torch.inference_mode()
-def generate_image_with_embeddings(prompt_embeddings, steps, seed, cfg_scale, width, height, progress):
-    pipe.to("cuda")
-    generator = torch.Generator(device="cuda").manual_seed(seed)
     with calculateDuration("Generating image"):
         # Generate image
         generated_image = pipe(
-            prompt_embeds=prompt_embeddings,
             num_inference_steps=steps,
             guidance_scale=cfg_scale,
             width=width,
@@ -229,7 +316,8 @@ def generate_image_with_embeddings(prompt_embeddings, steps, seed, cfg_scale, wi
     return generated_image
 # 主函数
 def run_lora(prompt_bg, character_prompts_json, character_positions_json, lora_strings_json, prompt_details, cfg_scale, steps, randomize_seed, seed, width, height, lora_scale, upload_to_r2, account_id, access_key, secret_key, bucket, progress=gr.Progress(track_tqdm=True)):
     # 解析角色提示词、位置和 LoRA 字符串
@@ -260,7 +348,8 @@ def run_lora(prompt_bg, character_prompts_json, character_positions_json, lora_s
                 pipe.load_lora_weights(lora_repo, weight_name=weights, adapter_name=adapter_name)
                 adapter_names.append(adapter_name)
                 # 将 adapter_name 设置为模型的属性
-                setattr(pipe.unet, 'adapter_name', adapter_name)
             else:
                 raise ValueError("Invalid LoRA string format. Each item must have 'repo', 'weights', and 'adapter_name' keys.")
         adapter_weights = [lora_scale] * len(adapter_names)
@@ -279,22 +368,28 @@ def run_lora(prompt_bg, character_prompts_json, character_positions_json, lora_s
     # 编码提示词
     with calculateDuration("Encoding prompts"):
         # 编码背景提示词
-        bg_text_input = pipe.tokenizer(prompt_bg, return_tensors="pt").to("cuda")
-        bg_embeddings = pipe.text_encoder(bg_text_input.input_ids.to(device))[0]
         # 编码角色提示词
-        character_embeddings = []
         for prompt in character_prompts:
-            char_text_input = pipe.tokenizer(prompt, return_tensors="pt").to("cuda")
-            char_embeddings = pipe.text_encoder(char_text_input.input_ids.to(device))[0]
-            character_embeddings.append(char_embeddings)
         # 编码互动细节提示词
-        details_text_input = pipe.tokenizer(prompt_details, return_tensors="pt").to("cuda")
-        details_embeddings = pipe.text_encoder(details_text_input.input_ids.to(device))[0]
         # 合并背景和互动细节的嵌入
-        prompt_embeddings = torch.cat([bg_embeddings, details_embeddings], dim=1)
     # 解析角色位置
     character_infos = []
@@ -309,10 +404,10 @@ def run_lora(prompt_bg, character_prompts_json, character_positions_json, lora_s
         masks.append(mask)
     # 替换注意力处理器
-    replace_attention_processors(pipe, masks, character_embeddings, adapter_names)
     # Generate image
-    final_image = generate_image_with_embeddings(prompt_embeddings, steps, seed, cfg_scale, width, height, progress)
     # 您可以在此处添加上传图片的代码
     result = {"status": "success", "message": "Image generated"}
@@ -334,7 +429,7 @@ with gr.Blocks(css=css) as demo:
     with gr.Row():
         with gr.Column():
             prompt_bg = gr.Text(label="Background Prompt", placeholder="Enter background/scene prompt", lines=2)
             character_prompts = gr.Text(label="Character Prompts (JSON List)", placeholder='["Character 1 prompt", "Character 2 prompt"]', lines=5)
             character_positions = gr.Text(label="Character Positions (JSON List)", placeholder='["Character 1 position", "Character 2 position"]', lines=5)

 import gradio as gr
 import numpy as np
 import random
+import spaces
 import torch
 import json
 import logging
 import time
 from datetime import datetime
 from io import BytesIO
+# from diffusers.models.attention_processor import AttentionProcessor
+from diffusers.models.attention_processor import AttnProcessor2_0
+import torch.nn.functional as F
 import re
 import json
 # 登录 Hugging Face Hub
 HF_TOKEN = os.environ.get("HF_TOKEN")
 login(token=HF_TOKEN)
+import diffusers
+print(diffusers.__version__)
 # 初始化
 dtype = torch.float16  # 您可以根据需要调整数据类型
     return mask_flat
 # 自定义注意力处理器
class CustomCrossAttentionProcessor(AttnProcessor2_0):
    """Attention processor that folds a per-adapter mask into the attention bias.

    Attention modules carrying an ``adapter_name`` attribute that appears in
    ``adapter_names`` get the matching entry of ``masks`` added to their
    additive attention bias. Every other module is delegated unchanged to the
    stock ``AttnProcessor2_0`` implementation.
    """

    def __init__(self, masks, adapter_names):
        """Store the per-character masks and their LoRA adapter names.

        Args:
            masks: One mask tensor per character, index-aligned with
                ``adapter_names``. Entries are interpreted as 1 = keep,
                0 = block. Each mask is flattened onto the key axis, so it
                presumably must hold ``key_length`` elements (TODO confirm
                against the mask builder).
            adapter_names: LoRA adapter name for each character.
        """
        super().__init__()
        self.masks = masks
        self.adapter_names = adapter_names

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
        **kwargs,
    ):
        """Run attention, applying the character mask for known adapters.

        Args:
            attn: The attention module being executed.
            hidden_states: Query-side input, either 3-D or 4-D.
            encoder_hidden_states: Key/value-side input; ``None`` means
                self-attention over ``hidden_states``.
            attention_mask: Optional mask forwarded to
                ``attn.prepare_attention_mask``.
            temb: Embedding passed to ``attn.spatial_norm`` when that norm
                is configured.
            **kwargs: Forwarded to the parent processor on the fallback path.

        Returns:
            The processed hidden states, in the same layout as the input.
        """
        adapter_name = getattr(attn, 'adapter_name', None)
        if adapter_name is None or adapter_name not in self.adapter_names:
            # Untagged (or unknown) modules keep the stock behaviour.
            return super().__call__(attn, hidden_states, encoder_hidden_states, attention_mask, temb, **kwargs)

        mask = self.masks[self.adapter_names.index(adapter_name)]

        # From here on this mirrors AttnProcessor2_0, with the custom mask
        # merged into the additive attention bias.
        residual = hidden_states
        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
        else:
            batch_size, _, _ = hidden_states.shape

        # Resolve the key length for BOTH self- and cross-attention. It used
        # to be bound only when encoder_hidden_states was given, so the
        # self-attention path failed with UnboundLocalError.
        is_cross_attention = encoder_hidden_states is not None
        key_source = encoder_hidden_states if is_cross_attention else hidden_states
        key_length = key_source.shape[1]

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, key_length, batch_size)
            # Split the head axis out so the bias broadcasts inside SDPA.
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
        else:
            # Neutral bias that the custom mask can simply be added to.
            attention_mask = torch.zeros(
                batch_size, attn.heads, 1, key_length, device=hidden_states.device, dtype=hidden_states.dtype
            )

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if not is_cross_attention:
            # Self-attention: keys/values come from the (already normalised)
            # hidden states, and the cross-attention norm does not apply.
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # Convert the 0/1 mask to an additive bias: kept positions add 0,
        # blocked positions add a large negative value. float16 cannot hold
        # -1e9, so its most negative finite value is used there instead.
        custom_attention_mask = mask.reshape(1, 1, 1, -1).to(hidden_states.device, dtype=hidden_states.dtype)
        mask_value = -65504.0 if hidden_states.dtype == torch.float16 else -1e9
        custom_attention_mask = (1.0 - custom_attention_mask) * mask_value

        # Clamp so that summing two large negative biases cannot overflow to
        # -inf, which would turn a fully blocked row into NaN after softmax.
        attention_mask = (attention_mask + custom_attention_mask).clamp(min=mask_value)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # Output projection followed by dropout.
        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor
        return hidden_states
 # 替换注意力处理器的函数
def replace_attention_processors(pipe, masks, adapter_names):
    """Install one shared masked-attention processor on the transformer.

    Walks every sub-module of ``pipe.transformer``. For each module that
    exposes an ``attn`` and/or ``cross_attn`` attribute, the attention object
    is tagged with the containing module's ``adapter_name`` (``None`` when the
    module has none) and its ``processor`` is replaced.

    Args:
        pipe: Pipeline object exposing a ``transformer`` module.
        masks: Per-character masks handed to the processor.
        adapter_names: Per-character LoRA adapter names handed to the
            processor.
    """
    shared_processor = CustomCrossAttentionProcessor(masks, adapter_names)

    for _, block in pipe.transformer.named_modules():
        for attr in ('attn', 'cross_attn'):
            if not hasattr(block, attr):
                continue
            attention = getattr(block, attr)
            # NOTE(review): the tag is read from the containing module; the
            # visible caller only sets `adapter_name` on `pipe.transformer`
            # itself, so this may always be None -- confirm intent.
            attention.adapter_name = getattr(block, 'adapter_name', None)
            attention.processor = shared_processor
 # 生成图像的函数
+def generate_image_with_embeddings(prompt_embeds, pooled_prompt_embeds, steps, seed, cfg_scale, width, height, progress):
+    pipe.to(device)
+    generator = torch.Generator(device=device).manual_seed(seed)
     with calculateDuration("Generating image"):
         # Generate image
         generated_image = pipe(
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
             num_inference_steps=steps,
             guidance_scale=cfg_scale,
             width=width,
     return generated_image
 # 主函数
+@spaces.GPU
+@torch.inference_mode()
 def run_lora(prompt_bg, character_prompts_json, character_positions_json, lora_strings_json, prompt_details, cfg_scale, steps, randomize_seed, seed, width, height, lora_scale, upload_to_r2, account_id, access_key, secret_key, bucket, progress=gr.Progress(track_tqdm=True)):
     # 解析角色提示词、位置和 LoRA 字符串
                 pipe.load_lora_weights(lora_repo, weight_name=weights, adapter_name=adapter_name)
                 adapter_names.append(adapter_name)
                 # 将 adapter_name 设置为模型的属性
+                setattr(pipe.transformer, 'adapter_name', adapter_name)
             else:
                 raise ValueError("Invalid LoRA string format. Each item must have 'repo', 'weights', and 'adapter_name' keys.")
         adapter_weights = [lora_scale] * len(adapter_names)
     # 编码提示词
     with calculateDuration("Encoding prompts"):
         # 编码背景提示词
+        bg_text_input = pipe.tokenizer(prompt_bg, return_tensors="pt").to(device)
+        bg_prompt_embeds = pipe.text_encoder_2(bg_text_input.input_ids.to(device))[0]
+        bg_pooled_embeds = pipe.text_encoder(bg_text_input.input_ids.to(device)).pooler_output
         # 编码角色提示词
+        character_prompt_embeds = []
+        character_pooled_embeds = []
         for prompt in character_prompts:
+            char_text_input = pipe.tokenizer(prompt, return_tensors="pt").to(device)
+            char_prompt_embeds = pipe.text_encoder_2(char_text_input.input_ids.to(device))[0]
+            char_pooled_embeds = pipe.text_encoder(char_text_input.input_ids.to(device)).pooler_output
+            character_prompt_embeds.append(char_prompt_embeds)
+            character_pooled_embeds.append(char_pooled_embeds)
         # 编码互动细节提示词
+        details_text_input = pipe.tokenizer(prompt_details, return_tensors="pt").to(device)
+        details_prompt_embeds = pipe.text_encoder_2(details_text_input.input_ids.to(device))[0]
+        details_pooled_embeds = pipe.text_encoder(details_text_input.input_ids.to(device)).pooler_output
         # 合并背景和互动细节的嵌入
+        prompt_embeds = torch.cat([bg_prompt_embeds, details_prompt_embeds], dim=1)
+        pooled_prompt_embeds = torch.cat([bg_pooled_embeds, details_pooled_embeds], dim=1)
     # 解析角色位置
     character_infos = []
         masks.append(mask)
     # 替换注意力处理器
+    replace_attention_processors(pipe, masks, adapter_names)
     # Generate image
+    final_image = generate_image_with_embeddings(prompt_embeddings, pooled_prompt_embeds, steps, seed, cfg_scale, width, height, progress)
     # 您可以在此处添加上传图片的代码
     result = {"status": "success", "message": "Image generated"}
     with gr.Row():
         with gr.Column():
             prompt_bg = gr.Text(label="Background Prompt", placeholder="Enter background/scene prompt", lines=2)
             character_prompts = gr.Text(label="Character Prompts (JSON List)", placeholder='["Character 1 prompt", "Character 2 prompt"]', lines=5)
             character_positions = gr.Text(label="Character Positions (JSON List)", placeholder='["Character 1 position", "Character 2 position"]', lines=5)